• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

OCHA-DAP / hdx-analysis-scripts / 16405476424

20 Jul 2025 11:42PM UTC coverage: 87.921% (-0.5%) from 88.434%
16405476424

Pull #11

github

web-flow
Merge 4d1c8394d into 717938af7
Pull Request #11: HDXDSYS-2332 Investigate HDX Analysis Scripts org stats failure

626 of 712 relevant lines covered (87.92%)

0.88 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.96
/src/hdx/analysis_scripts/common/dataset_statistics.py
1
import logging
1✔
2
import re
1✔
3
from collections import UserDict
1✔
4
from datetime import datetime, timedelta
1✔
5

6
from dateutil.parser import ParserError
1✔
7
from dateutil.relativedelta import relativedelta
1✔
8

9
from hdx.analysis_scripts.common import get_previous_quarter
1✔
10
from hdx.api.configuration import Configuration
1✔
11
from hdx.utilities.dateparse import parse_date
1✔
12

13
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
1✔
14

15

16
class DatasetStatistics(UserDict):
    """Dictionary view of a dataset plus statistics derived from it.

    The mapping content is ``dataset.data``. On construction every ``get_*``
    method is run once, each storing its results as instance attributes
    (mostly "Y"/"N"/"" flags, counters and dates).

    Args:
        today: Reference "now". It is subtracted from and compared with the
            parsed dataset dates.
        dataset_name_to_explorers: Container tested with ``in`` against the
            dataset's ``name``.
        dataset_id_to_requests: Mapping from dataset ``id`` to an iterable of
            request dictionaries (keys read: ``state``, ``data_shared``,
            ``rejected``).
        last_modified_aging: Mapping from integer update frequency to a
            mapping with "Due", "Overdue" and "Delinquent" thresholds that
            are compared with ``today - date``.
        end_date_aging: Mapping from integer update frequency to a mapping
            with an "OutOfDate" threshold compared with ``today - end date``.
        dataset: Dataset object providing ``data``, ``is_requestable()``,
            ``get_resources()``, ``get_time_period()``,
            ``get_expected_update_frequency()`` and ``get_tags()``.
    """

    # Captures the text between the first "(" and the last ")" of a string.
    bracketed_date = re.compile(r"\((.*)\)")

    def __init__(
        self,
        today,
        dataset_name_to_explorers,
        dataset_id_to_requests,
        last_modified_aging,
        end_date_aging,
        dataset,
    ):
        super().__init__(dataset.data)
        self.today = today
        self.last_3_months = today - relativedelta(months=3)
        self.previous_quarter = get_previous_quarter(today)
        self.dataset_name_to_explorers = dataset_name_to_explorers
        self.dataset_id_to_requests = dataset_id_to_requests
        self.last_modified_aging = last_modified_aging
        self.end_date_aging = end_date_aging
        self.dataset = dataset
        self.last_modified = None
        self.configuration = Configuration.read()
        # Order matters: later steps read attributes set by earlier ones
        # (exclude_from_stats, last_modified, enddate, update_frequency,
        # updated_by_script).
        self.get_status()
        self.get_cod()
        self.get_date_info()
        self.get_update_frequency_info()
        self.get_in_explorer_or_grid()
        self.get_requests()
        self.crisis_tag = False
        self.get_tags()
        self.get_updated_by_script()
        self.get_last_modified_freshness()
        self.get_end_date_freshness()

    def _get_time_period(self):
        """Return the dataset's time period, or None if its dates cannot be parsed."""
        try:
            return self.dataset.get_time_period()
        except ParserError:
            return None

    def get_status(self):
        """Set visibility, resource and exclusion attributes.

        Sets ``public``, ``requestable``, ``archived`` and
        ``exclude_from_stats`` ("Y"/"N"), ``data_link`` and ``data_type``
        (url and url_type of the first resource, "" if none or requestable)
        and the ``internal_resources`` / ``external_resources`` counters.
        """
        self.public = "N" if self["private"] else "Y"
        self.internal_resources = 0
        self.external_resources = 0
        self.data_link = ""
        self.data_type = ""
        if self.dataset.is_requestable():
            self.requestable = "Y"
        else:
            self.requestable = "N"
            resources = self.dataset.get_resources()
            if resources:
                first_resource = resources[0]
                self.data_link = first_resource["url"]
                self.data_type = first_resource["url_type"]
                for resource in resources:
                    # Resources with url_type "api" count as external,
                    # everything else as internal.
                    if resource["url_type"] == "api":
                        self.external_resources += 1
                    else:
                        self.internal_resources += 1
        self.archived = "Y" if self["archived"] else "N"
        if self.public == "N" or self.requestable == "Y" or self.archived == "Y":
            self.exclude_from_stats = "Y"
        else:
            self.exclude_from_stats = "N"

    def get_cod(self):
        """Set ``is_cod`` to "Y" when the dataset has a truthy ``cod_level``."""
        self.is_cod = "Y" if self.get("cod_level") else "N"

    def get_date_info(self):
        """Set creation, time period and last modified attributes.

        Sets ``created``, ``startdate``, ``enddate`` ("ongoing", a date
        string, or "" when there is no usable time period),
        ``last_modified``, ``updated_last_3_months`` and
        ``updated_previous_qtr``. The last two are "Y"/"N", or "" when the
        dataset has no ``last_modified`` value.
        """
        self.created = parse_date(self["metadata_created"], include_microseconds=True)
        time_period = self._get_time_period()
        if time_period:
            self.startdate = time_period["startdate_str"]
            if time_period["ongoing"]:
                self.enddate = "ongoing"
            else:
                self.enddate = time_period["enddate_str"]
        else:
            self.startdate = ""
            self.enddate = ""
            logger.error(f"Dataset {self['name']} has no time period!")
        last_modified = self.get("last_modified")
        if not last_modified:
            logger.error(f"Dataset {self['name']} has no last modified field!")
            self.last_modified = None
            self.updated_last_3_months = ""
            # Keep both "updated" flags defined so that readers of the
            # instance never hit a missing attribute.
            self.updated_previous_qtr = ""
            return
        self.last_modified = parse_date(last_modified, include_microseconds=True)
        if self.last_3_months < self.last_modified <= self.today:
            self.updated_last_3_months = "Y"
        else:
            self.updated_last_3_months = "N"
        quarter_start, quarter_end = self.previous_quarter[0], self.previous_quarter[1]
        if quarter_start <= self.last_modified <= quarter_end:
            self.updated_previous_qtr = "Y"
        else:
            self.updated_previous_qtr = "N"

    def get_update_frequency_info(self):
        """Set ``update_frequency``, ``live`` and ``ongoing``.

        ``update_frequency`` is the raw ``data_update_frequency`` value (""
        if absent). ``live`` is "Y" when the expected update frequency is
        "Live". ``ongoing`` is "Y"/"N", or "" when there is no usable time
        period.
        """
        self.update_frequency = self.get("data_update_frequency", "")
        expected_frequency = self.dataset.get_expected_update_frequency()
        self.live = "Y" if expected_frequency == "Live" else "N"
        time_period = self._get_time_period()
        if time_period:
            self.ongoing = "Y" if time_period["ongoing"] else "N"
        else:
            self.ongoing = ""

    def get_in_explorer_or_grid(self):
        """Set ``in_explorer_or_grid`` from membership of the dataset name."""
        if self["name"] in self.dataset_name_to_explorers:
            self.in_explorer_or_grid = "Y"
        else:
            self.in_explorer_or_grid = "N"

    def get_requests(self):
        """Count the requests recorded for this dataset's id.

        Requests in state "new" or "open" are counted as such; any other
        state counts as archived, and an archived request additionally
        counts as shared (``data_shared`` truthy) or else as denied
        (``rejected`` truthy).
        """
        self.new_requests = 0
        self.open_requests = 0
        self.archived_requests = 0
        self.shared_requests = 0
        self.denied_requests = 0
        for request in self.dataset_id_to_requests.get(self["id"], []):
            state = request["state"]
            if state == "new":
                self.new_requests += 1
            elif state == "open":
                self.open_requests += 1
            else:
                self.archived_requests += 1
                if request["data_shared"]:
                    self.shared_requests += 1
                elif request["rejected"]:
                    self.denied_requests += 1

    def get_tags(self):
        """Set ``tags`` (comma separated) and flag any "crisis-" tag.

        ``crisis_tag`` is only ever switched to True here; it is expected to
        have been initialised beforehand (``__init__`` sets it to False).
        """
        tags = self.dataset.get_tags()
        self.tags = ", ".join(tags)
        if any(tag.startswith("crisis-") for tag in tags):
            self.crisis_tag = True

    def add_tags_to_set(self, tagset):
        """Add this dataset's tags to ``tagset`` in place.

        Args:
            tagset: Set-like object supporting ``update``.
        """
        tagset.update(self.dataset.get_tags())

    def get_updated_by_script(self):
        """Classify the dataset's ``updated_by_script`` value.

        Sets ``updated_by_script`` (parsed date or None) and the "Y"/"N"
        flags ``updated_by_noncod_script``, ``updated_by_cod_script``,
        ``old_updated_by_noncod_script``, ``old_updated_by_cod_script`` and
        ``outdated_lastmodified``. All flags stay "N" when the value is
        missing, the dataset is excluded from stats, the value is from an
        ignored script, or no date can be extracted from it.
        """
        updated_by_script = self.get("updated_by_script")
        self.updated_by_script = None
        self.updated_by_noncod_script = "N"
        self.updated_by_cod_script = "N"
        self.old_updated_by_noncod_script = "N"
        self.old_updated_by_cod_script = "N"
        self.outdated_lastmodified = "N"
        if not updated_by_script:
            return
        if self.exclude_from_stats == "Y":
            return
        is_internal = "HDXINTERNAL" in updated_by_script
        if is_internal and "tagbot" in updated_by_script:
            return
        bulk_change_markers = (
            "HDXPythonLibrary/5.5.6-test (2022-03-15",
            "HDXPythonLibrary/5.4.8-test (2022-01-04",
            "HDXPythonLibrary/5.4.1-test (2021-11-17",
        )
        # Mike maintainer bulk change
        if any(marker in updated_by_script for marker in bulk_change_markers):
            return
        match = self.bracketed_date.search(updated_by_script)
        if match is None:
            return
        try:
            self.updated_by_script = parse_date(
                match.group(1), include_microseconds=True
            )
        except ParserError:
            # Bracketed text is not a date: leave everything unclassified.
            return
        if is_internal and "CODs" in updated_by_script:
            if "cod_level" in self.data:
                self.updated_by_cod_script = "Y"
            else:
                # no longer updated by COD script
                self.old_updated_by_cod_script = "Y"
            return

        if not self.last_modified:
            return
        if self.updated_by_script > self.last_modified:
            self.updated_by_noncod_script = "Y"
            if self.dataset.get_expected_update_frequency() != "Live":
                # Script ran more than an hour after the data last changed.
                if self.updated_by_script - self.last_modified > timedelta(hours=1):
                    self.outdated_lastmodified = "Y"
            return
        # Data modified at or after the script run: still attribute the
        # update to the script if the two are less than an hour apart.
        if self.last_modified - self.updated_by_script < timedelta(hours=1):
            self.updated_by_noncod_script = "Y"
        else:
            self.old_updated_by_noncod_script = "Y"

    def calculate_lm_freshness(
        self, last_modified: datetime, update_frequency: int
    ) -> str:
        """Calculate freshness based on a last modified date and the expected update
        frequency. Returns "Fresh", "Due", "Overdue" or "Delinquent".

        Args:
            last_modified (datetime): Last modified date
            update_frequency (int): Expected update frequency

        Returns:
            str: "Fresh", "Due", "Overdue" or "Delinquent"

        Raises:
            KeyError: If ``update_frequency`` is not in ``last_modified_aging``
        """
        delta = self.today - last_modified
        thresholds = self.last_modified_aging[update_frequency]
        # Checked from most to least severe so the worst matching status wins.
        for status in ("Delinquent", "Overdue", "Due"):
            if delta >= thresholds[status]:
                return status
        return "Fresh"

    def get_last_modified_freshness(self):
        """Set ``last_modified_fresh``.

        The value stays "" when the dataset is excluded from stats, has no
        last modified date or has no update frequency. Frequencies 0, -1 and
        -2 are always "Fresh". Otherwise freshness is calculated from the
        latest of last modified, ``review_date`` and the script update date.
        """
        self.last_modified_fresh = ""
        if self.exclude_from_stats == "Y":
            return
        if not self.last_modified:
            return
        latest_of_modifieds = self.last_modified
        review_date = self.get("review_date")
        if review_date is not None:
            review_date = parse_date(review_date, include_microseconds=True)
            if review_date > latest_of_modifieds:
                latest_of_modifieds = review_date
        if self.updated_by_script and self.updated_by_script > latest_of_modifieds:
            latest_of_modifieds = self.updated_by_script
        if not self.update_frequency:
            return
        update_frequency = int(self.update_frequency)
        if update_frequency in (0, -1, -2):
            self.last_modified_fresh = "Fresh"
        else:
            self.last_modified_fresh = self.calculate_lm_freshness(
                latest_of_modifieds, update_frequency
            )

    def calculate_ed_uptodate(self, end_date: datetime, update_frequency: int) -> str:
        """Calculate up to date based on time period end date and the expected
        update frequency. Returns "UpToDate" or "OutOfDate".

        Args:
            end_date (datetime): Time period end date
            update_frequency (int): Expected update frequency

        Returns:
            str: "UpToDate" or "OutOfDate"

        Raises:
            KeyError: If ``update_frequency`` is not in ``end_date_aging``
        """
        delta = self.today - end_date
        if delta >= self.end_date_aging[update_frequency]["OutOfDate"]:
            return "OutOfDate"
        return "UpToDate"

    def get_end_date_freshness(self):
        """Set ``end_date_uptodate``.

        The value stays "" when the dataset is excluded from stats, has no
        update frequency, has a negative update frequency, or has no end
        date. Frequency 0 and an "ongoing" end date are always "UpToDate".
        Otherwise the end date is parsed and compared with the aging table.
        """
        self.end_date_uptodate = ""
        if self.exclude_from_stats == "Y":
            return
        if not self.update_frequency:
            return
        update_frequency = int(self.update_frequency)
        if update_frequency < 0:
            return
        if update_frequency == 0 or self.enddate == "ongoing":
            self.end_date_uptodate = "UpToDate"
            return
        if not self.enddate:
            # No usable time period was found, so there is nothing to parse.
            return
        enddate = parse_date(self.enddate)
        self.end_date_uptodate = self.calculate_ed_uptodate(enddate, update_frequency)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc