• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

OCHA-DAP / hdx-analysis-scripts / 16405476424

20 Jul 2025 11:42PM UTC coverage: 87.921% (-0.5%) from 88.434%
16405476424

Pull #11

github

web-flow
Merge 4d1c8394d into 717938af7
Pull Request #11: HDXDSYS-2332 Investigate HDX Analysis Scripts org stats failure

626 of 712 relevant lines covered (87.92%)

0.88 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.9
/src/hdx/analysis_scripts/orgs/__main__.py
1
import argparse
1✔
2
import logging
1✔
3
import os
1✔
4
import re
1✔
5
from os import mkdir
1✔
6
from os.path import expanduser, join
1✔
7
from shutil import rmtree
1✔
8

9
from hdx.analysis_scripts.common import (
1✔
10
    get_aging,
11
    get_dataset_name_to_explorers,
12
    get_requests_mappings,
13
)
14
from hdx.analysis_scripts.common.dataset_statistics import DatasetStatistics
1✔
15
from hdx.analysis_scripts.common.downloads import Downloads
1✔
16
from hdx.api.configuration import Configuration
1✔
17
from hdx.facades.keyword_arguments import facade
1✔
18
from hdx.location.country import Country
1✔
19
from hdx.utilities.dateparse import default_date, now_utc
1✔
20
from hdx.utilities.dictandlist import dict_of_lists_add, write_list_to_csv
1✔
21
from hdx.utilities.path import script_dir_plus_file
1✔
22
from hdx.utilities.text import get_fraction_str
1✔
23

24
# Module-level logger for this script
logger = logging.getLogger(__name__)

# Key used to look up this script's user agent in ~/.useragents.yaml
lookup = "hdx-analysis-scripts"

# Matches text inside parentheses, e.g. a bracketed date suffix.
# NOTE(review): appears unused in this module — confirm before removing,
# in case it is referenced elsewhere.
bracketed_date = re.compile(r"\((.*)\)")
29

30

31
def main(downloads, output_dir, **ignore):
    """Generate per-organisation and overall HDX statistics CSV files.

    Recreates ``output_dir``, reads configuration, collects organisation
    metadata, per-dataset statistics, request counts and Mixpanel download
    figures, then writes ``org_stats.csv`` (one row per organisation) and
    ``total_stats.csv`` (a single totals row). Also logs a warning listing
    datasets where updated_by_script is significantly after last_modified.

    Args:
        downloads: Downloads object providing all HDX/Mixpanel data access.
        output_dir: Folder for the output CSVs (deleted and recreated).
        **ignore: Extra keyword arguments from the facade, ignored.

    Returns:
        Tuple of (total public datasets, total updated by cod script,
        total updated by other scripts).
    """
    # Start from a clean output folder on every run
    rmtree(output_dir, ignore_errors=True)
    mkdir(output_dir)

    configuration = Configuration.read()

    downloads.set_api_key(configuration.get_api_key())
    org_type_mapping = configuration["org_type_mapping"]
    org_stats_url = configuration["org_stats_url"]
    # Spreadsheet-sourced lookups keyed by organisation name
    name_to_geospatiality, name_to_location = downloads.get_geospatiality_locations(
        org_stats_url
    )
    dataset_name_to_explorers = get_dataset_name_to_explorers(downloads)
    dataset_id_to_requests, organisation_name_to_requests = get_requests_mappings(
        downloads
    )
    # Aging thresholds used to classify freshness / up-to-dateness
    last_modified_aging = get_aging(configuration["last_modified_aging"])
    end_date_aging = get_aging(configuration["end_date_aging"])
    dataset_3m_downloads = downloads.get_mixpanel_downloads(3)
    dataset_1y_downloads = downloads.get_mixpanel_downloads(12)
    logger.info("Obtaining organisations data")
    organisations = downloads.get_all_organisations()
    # Running totals across all organisations (public excludes
    # requestable and archived datasets)
    total_public = 0
    total_public_internal = 0
    total_public_external = 0
    total_updated_by_cod = 0
    total_updated_by_script = 0
    total_lm_fresh = 0
    total_lm_not_fresh = 0
    total_ed_uptodate = 0
    total_ed_outofdate = 0
    organisation_name_to_id = {}
    # First pass: enrich each organisation dict in place with metadata,
    # user-capacity counts, request counts and zeroed per-dataset counters
    for organisation_id, organisation in organisations.items():
        organisation_name = organisation["name"]
        organisation_name_to_id[organisation_name] = organisation_id
        geospatiality = name_to_geospatiality.get(organisation_name, "")
        organisation["geospatiality"] = geospatiality
        organisation_location = name_to_location.get(organisation_name, "")
        organisation["location"] = organisation_location
        latitude, longitude = "", ""
        # Only a 3-character location is treated as an ISO3 country code
        if organisation_location and len(organisation_location) == 3:
            country_info = Country.get_country_info_from_iso3(organisation_location)
            if country_info:
                latitude = country_info["#geo+lat"]
                longitude = country_info["#geo+lon"]
        organisation["latitude"] = latitude
        organisation["longitude"] = longitude
        # Count users by their capacity within the organisation
        admins = 0
        editors = 0
        members = 0
        for user in organisation["users"]:
            match user["capacity"]:
                case "admin":
                    admins += 1
                case "editor":
                    editors += 1
                case "member":
                    members += 1
                case x:
                    # Fail loudly on unexpected capacities rather than miscount
                    raise ValueError(f"Unknown capacity {x}!")
        organisation["number of admins"] = admins
        organisation["number of editors"] = editors
        organisation["number of members"] = members
        # Counters filled in by the dataset pass below
        organisation["downloads last 90 days"] = 0
        organisation["downloads last 12 months"] = 0
        organisation["public datasets"] = 0
        organisation["requestable datasets"] = 0
        organisation["private datasets"] = 0
        organisation["archived datasets"] = 0
        organisation["public internal resources"] = 0
        organisation["public external resources"] = 0
        organisation["updated by cod script"] = 0
        organisation["formerly updated by cod script"] = 0
        organisation["updated by script"] = 0
        organisation["old updated by script"] = 0
        organisation["any updated last 3 months"] = "No"
        organisation["any public updated last 3 months"] = "No"
        organisation["any updated previous quarter"] = "No"
        organisation["any public updated previous quarter"] = "No"
        organisation["public live datasets"] = 0
        organisation["public ongoing datasets"] = 0
        organisation["lm fresh datasets"] = 0
        organisation["lm due datasets"] = 0
        organisation["lm overdue datasets"] = 0
        organisation["lm delinquent datasets"] = 0
        organisation["ed uptodate datasets"] = 0
        organisation["ed outofdate datasets"] = 0
        # default_date sentinels are mapped to None at output time
        organisation["latest created dataset date"] = default_date
        organisation["latest scripted update date"] = default_date
        organisation["in explorer or grid"] = "No"
        organisation["closed"] = "Yes" if organisation["closed_organization"] else "No"

        # Classify this organisation's data requests by state
        new_requests = 0
        open_requests = 0
        archived_requests = 0
        shared_requests = 0
        denied_requests = 0
        for request in organisation_name_to_requests.get(organisation_name, []):
            if request["state"] == "new":
                new_requests += 1
            elif request["state"] == "open":
                open_requests += 1
            else:
                # Anything not new/open counts as archived; archived requests
                # may additionally be shared or denied
                archived_requests += 1
                if request["data_shared"]:
                    shared_requests += 1
                elif request["rejected"]:
                    denied_requests += 1
        organisation["new requests"] = new_requests
        organisation["open requests"] = open_requests
        organisation["archived requests"] = archived_requests
        organisation["shared requests"] = shared_requests
        organisation["denied requests"] = denied_requests
        organisation["tags"] = set()
        organisation["has crisis"] = "N"
    # organisation name -> list of dataset names with outdated last_modified
    outdated_lastmodifieds = {}
    # Second pass: accumulate per-dataset statistics into each dataset's
    # owning organisation and the running totals
    for dataset in downloads.get_all_datasets():
        datasetstats = DatasetStatistics(
            downloads.today,
            dataset_name_to_explorers,
            dataset_id_to_requests,
            last_modified_aging,
            end_date_aging,
            dataset,
        )
        name = dataset["name"]
        organisation_id = dataset["organization"]["id"]
        organisation = organisations[organisation_id]
        is_public_not_requestable_archived = False
        if datasetstats.public == "N":
            # Private datasets contribute nothing beyond their count
            organisation["private datasets"] += 1
            continue
        elif datasetstats.requestable == "Y":
            organisation["requestable datasets"] += 1
        elif datasetstats.archived == "Y":
            organisation["archived datasets"] += 1
        else:
            # Truly public: counts towards resource and public totals
            organisation["public datasets"] += 1
            total_public += 1
            is_public_not_requestable_archived = True
            organisation["public internal resources"] += datasetstats.internal_resources
            organisation["public external resources"] += datasetstats.external_resources
            total_public_internal += datasetstats.internal_resources
            total_public_external += datasetstats.external_resources

        downloads_last_3months = dataset_3m_downloads.get(dataset["id"], 0)
        organisation["downloads last 90 days"] += downloads_last_3months
        downloads_last_year = dataset_1y_downloads.get(dataset["id"], 0)
        organisation["downloads last 12 months"] += downloads_last_year
        # Without a last_modified date none of the freshness/update
        # statistics below can be computed
        if datasetstats.last_modified is None:
            continue
        if datasetstats.updated_last_3_months == "Y":
            organisation["any updated last 3 months"] = "Yes"
            if is_public_not_requestable_archived:
                organisation["any public updated last 3 months"] = "Yes"
        if datasetstats.updated_previous_qtr == "Y":
            organisation["any updated previous quarter"] = "Yes"
            if is_public_not_requestable_archived:
                organisation["any public updated previous quarter"] = "Yes"
        if is_public_not_requestable_archived:
            if datasetstats.live == "Y":
                organisation["public live datasets"] += 1
            if datasetstats.ongoing == "Y":
                organisation["public ongoing datasets"] += 1
        # Freshness based on last modified date
        match datasetstats.last_modified_fresh:
            case "Fresh":
                organisation["lm fresh datasets"] += 1
                total_lm_fresh += 1
            case "Due":
                organisation["lm due datasets"] += 1
                total_lm_not_fresh += 1
            case "Overdue":
                organisation["lm overdue datasets"] += 1
                total_lm_not_fresh += 1
            case "Delinquent":
                organisation["lm delinquent datasets"] += 1
                total_lm_not_fresh += 1
        # Up-to-dateness based on the dataset's end date
        match datasetstats.end_date_uptodate:
            case "UpToDate":
                organisation["ed uptodate datasets"] += 1
                total_ed_uptodate += 1
            case "OutOfDate":
                organisation["ed outofdate datasets"] += 1
                total_ed_outofdate += 1
        if datasetstats.in_explorer_or_grid == "Y":
            organisation["in explorer or grid"] = "Yes"
        if (
            datasetstats.updated_by_cod_script == "Y"
            and is_public_not_requestable_archived
        ):
            organisation["updated by cod script"] += 1
            total_updated_by_cod += 1
        if (
            datasetstats.old_updated_by_cod_script == "Y"
            and is_public_not_requestable_archived
        ):
            organisation["formerly updated by cod script"] += 1
        if datasetstats.created > organisation["latest created dataset date"]:
            organisation["latest created dataset date"] = datasetstats.created
        if datasetstats.updated_by_script:
            if datasetstats.last_modified > organisation["latest scripted update date"]:
                organisation["latest scripted update date"] = datasetstats.last_modified
            if (
                datasetstats.updated_by_noncod_script == "Y"
                and is_public_not_requestable_archived
            ):
                organisation["updated by script"] += 1
                total_updated_by_script += 1
            if datasetstats.outdated_lastmodified == "Y":
                # Recorded for the warning message emitted after the loop
                dict_of_lists_add(outdated_lastmodifieds, organisation["name"], name)
            if datasetstats.old_updated_by_noncod_script == "Y":
                organisation["old updated by script"] += 1
        datasetstats.add_tags_to_set(organisation["tags"])
        if datasetstats.crisis_tag:
            organisation["has crisis"] = "Y"

    # Column headers for org_stats.csv — order must match the row below
    headers = [
        "Organisation name",
        "Organisation title",
        "Organisation acronym",
        "Organisation id",
        "Organisation type",
        "Geospatiality",
        "Location",
        "Latitude",
        "Longitude",
        "Number of admins",
        "Number of editors",
        "Number of members",
        "Downloads last 90 days",
        "Downloads last 12 months",
        "Public datasets",
        "Requestable datasets",
        "Private datasets",
        "Archived datasets",
        "Public Internal Resources",
        "Public External Resources",
        "Public API (non-cod scripted)",
        "% of public API (non-cod scripted)",
        "Public cod scripted",
        "% of public cod scripted",
        "Public formerly cod scripted",
        "% of public formerly cod scripted",
        "Public previous scripted",
        "% of public previous scripted",
        "Public live",
        "% of public live",
        "Public ongoing",
        "% of public ongoing",
        "Followers",
        "Any updated last 3 months",
        "Any public updated last 3 months",
        "Any updated previous quarter",
        "Any public updated previous quarter",
        "Last modified fresh datasets",
        "Last modified due datasets",
        "Last modified overdue datasets",
        "Last modified delinquent datasets",
        "End date up to date datasets",
        "End date out of date datasets",
        "Latest created dataset date",
        "Latest scripted update date",
        "In explorer or grid",
        "Closed",
        "New requests",
        "Open requests",
        "Total archived requests",
        "Shared requests",
        "Denied requests",
        "Tags",
        "Has crisis",
    ]

    def get_number_percentage(organisation, key):
        """Return (count, percentage-of-public-datasets) for the given key.

        Returns ("", "") when the stored value is the empty string.
        The percentage is formatted with no decimal places.
        """
        number = organisation[key]
        if number == "":
            return "", ""
        percentage = get_fraction_str(
            number * 100,
            organisation["public datasets"],
            format="%.0f",
        )
        return number, percentage

    logger.info("Generating rows")
    rows = list()
    # Emit one CSV row per organisation, sorted by organisation name
    for organisation_name in sorted(organisation_name_to_id):
        organisation = organisations[organisation_name_to_id[organisation_name]]
        organisation_type = org_type_mapping[organisation["hdx_org_type"]]
        updated_by_cod_script, percentage_cod = get_number_percentage(
            organisation, "updated by cod script"
        )
        old_updated_by_cod_script, old_percentage_cod = get_number_percentage(
            organisation, "formerly updated by cod script"
        )
        updated_by_api, percentage_api = get_number_percentage(
            organisation, "updated by script"
        )
        old_updated_by_script, percentage_old_script = get_number_percentage(
            organisation, "old updated by script"
        )
        live_datasets, percentage_live = get_number_percentage(
            organisation, "public live datasets"
        )
        ongoing_datasets, percentage_ongoing = get_number_percentage(
            organisation, "public ongoing datasets"
        )

        # default_date sentinel means "never" -> output None, else ISO date
        latest_created_dataset_date = organisation["latest created dataset date"]
        if latest_created_dataset_date == default_date:
            latest_created_dataset_date = None
        else:
            latest_created_dataset_date = latest_created_dataset_date.date().isoformat()
        latest_scripted_update_date = organisation["latest scripted update date"]
        if latest_scripted_update_date == default_date:
            latest_scripted_update_date = None
        else:
            latest_scripted_update_date = latest_scripted_update_date.date().isoformat()
        row = [
            organisation_name,
            organisation["title"],
            organisation.get("org_acronym", ""),
            organisation["id"],
            organisation_type,
            organisation["geospatiality"],
            organisation["location"],
            organisation["latitude"],
            organisation["longitude"],
            organisation["number of admins"],
            organisation["number of editors"],
            organisation["number of members"],
            organisation["downloads last 90 days"],
            organisation["downloads last 12 months"],
            organisation["public datasets"],
            organisation["requestable datasets"],
            organisation["private datasets"],
            organisation["archived datasets"],
            organisation["public internal resources"],
            organisation["public external resources"],
            updated_by_api,
            percentage_api,
            updated_by_cod_script,
            percentage_cod,
            old_updated_by_cod_script,
            old_percentage_cod,
            old_updated_by_script,
            percentage_old_script,
            live_datasets,
            percentage_live,
            ongoing_datasets,
            percentage_ongoing,
            organisation["num_followers"],
            organisation["any updated last 3 months"],
            organisation["any public updated last 3 months"],
            organisation["any updated previous quarter"],
            organisation["any public updated previous quarter"],
            organisation["lm fresh datasets"],
            organisation["lm due datasets"],
            organisation["lm overdue datasets"],
            organisation["lm delinquent datasets"],
            organisation["ed uptodate datasets"],
            organisation["ed outofdate datasets"],
            latest_created_dataset_date,
            latest_scripted_update_date,
            organisation["in explorer or grid"],
            organisation["closed"],
            organisation["new requests"],
            organisation["open requests"],
            organisation["archived requests"],
            organisation["shared requests"],
            organisation["denied requests"],
            ",".join(sorted(organisation["tags"])),
            organisation["has crisis"],
        ]
        rows.append(row)
    if rows:
        filepath = join(output_dir, "org_stats.csv")
        logger.info(f"Writing rows to {filepath}")
        write_list_to_csv(filepath, rows, headers, encoding="utf-8")

    # Warn about datasets whose last_modified lags their scripted update
    if outdated_lastmodifieds:
        message = ["updated_by_script is significantly after last_modified for:\n"]
        for organisation_name, dataset_names in outdated_lastmodifieds.items():
            message.append(f"organisation {organisation_name} with ")
            no_names = len(dataset_names)
            if no_names > 6:
                # Too many to list: show the count and one example
                message.append(f"{no_names} datasets such as {dataset_names[0]}")
            else:
                message.append("datasets: ")
                for dataset_name in dataset_names:
                    message.append(f"{dataset_name} ")
            message.append("\n")
        logger.warning("".join(message))

    # Log headline totals and OKR percentages
    logger.info(
        f"Total public datasets (excluding requestable, archived) = {total_public}"
    )
    logger.info(f"Total public updated by cod script = {total_updated_by_cod}")
    logger.info(
        f"Total public updated by all other scripts = {total_updated_by_script}"
    )
    quarterly_api_okr = get_fraction_str(
        total_updated_by_script * 100,
        total_public,
        format="%.0f",
    )
    logger.info(f"Quarterly % API OKR = {quarterly_api_okr}")

    logger.info(f"Total fresh datasets (using last modified) = {total_lm_fresh}")
    logger.info(
        f"Total non-fresh datasets (using last modified) = {total_lm_not_fresh}"
    )
    quarterly_lm_fresh_okr = get_fraction_str(
        total_lm_fresh * 100,
        (total_lm_fresh + total_lm_not_fresh),
        format="%.0f",
    )
    logger.info(f"Quarterly % last modified fresh OKR = {quarterly_lm_fresh_okr}")

    logger.info(f"Total up to date datasets (using end date) = {total_ed_uptodate}")
    logger.info(f"Total out of date datasets (using end date) = {total_ed_outofdate}")
    quarterly_ed_uptodate_okr = get_fraction_str(
        total_ed_uptodate * 100,
        (total_ed_uptodate + total_ed_outofdate),
        format="%.0f",
    )
    logger.info(f"Quarterly % end date up to date OKR = {quarterly_ed_uptodate_okr}")
    filepath = join(output_dir, "total_stats.csv")
    logger.info(f"Writing totals to {filepath}")
    # Single-row totals CSV
    headers = [
        "Public - Request & Archive",
        "Public Internal Resources",
        "Public External Resources",
        "Updated by COD",
        "Updated by Script",
        "Quarterly % API OKR",
        "Last Modified Fresh",
        "Last Modified Not Fresh",
        "Quarterly % Last Modified Fresh OKR",
        "End Date Up to Date",
        "End Date Out Of Date",
        "Quarterly % End Date Up To Date OKR",
    ]
    rows = [
        [
            total_public,
            total_public_internal,
            total_public_external,
            total_updated_by_cod,
            total_updated_by_script,
            quarterly_api_okr,
            total_lm_fresh,
            total_lm_not_fresh,
            quarterly_lm_fresh_okr,
            total_ed_uptodate,
            total_ed_outofdate,
            quarterly_ed_uptodate_okr,
        ]
    ]
    write_list_to_csv(filepath, rows, headers, encoding="utf-8")
    return total_public, total_updated_by_cod, total_updated_by_script
492

493

494
if __name__ == "__main__":
    # Parse command-line options for output and saved-data locations
    argument_parser = argparse.ArgumentParser(description="Org Stats script")
    argument_parser.add_argument(
        "-od", "--output_dir", default="output", help="Output folder"
    )
    argument_parser.add_argument(
        "-sd", "--saved_dir", default=None, help="Dir for downloaded data"
    )
    parsed_args = argument_parser.parse_args()

    user_home = expanduser("~")
    run_date = now_utc()
    # Downloads handles all HDX/Mixpanel access; Mixpanel credentials
    # come from the user's home directory
    downloads = Downloads(
        run_date, join(user_home, ".mixpanel.yaml"), parsed_args.saved_dir
    )

    # Prefer .useragents.yaml, falling back to the .yml spelling
    user_agent_config_path = join(user_home, ".useragents.yaml")
    if not os.path.exists(user_agent_config_path):
        user_agent_config_path = join(user_home, ".useragents.yml")

    # Run main via the HDX facade, which wires up configuration and logging
    facade(
        main,
        hdx_site="prod",
        user_agent_config_yaml=user_agent_config_path,
        user_agent_lookup=lookup,
        project_config_yaml=script_dir_plus_file(
            join("config", "project_configuration.yaml"), Downloads
        ),
        downloads=downloads,
        output_dir=parsed_args.output_dir,
    )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc