• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

OCHA-DAP / hdx-scraper-unesco / 14918015558

08 May 2025 11:24PM UTC coverage: 86.139% (+0.4%) from 85.714%
14918015558

Pull #5

github

web-flow
Merge f6656dba3 into 06a72e8b8
Pull Request #5: HDXDSYS-2109 Update UNESCO pipeline to use latest data

16 of 17 new or added lines in 1 file covered. (94.12%)

174 of 202 relevant lines covered (86.14%)

0.86 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

86.14
/src/hdx/scraper/unesco/unesco.py
1
#!/usr/bin/python
2
"""
3
UNESCO:
4
------
5

6
Reads UNESCO bulk files and creates datasets.
7

8
"""
9

10
import logging
1✔
11
import re
1✔
12
from os import remove, rename
1✔
13
from os.path import exists, join, split
1✔
14
from shutil import copyfileobj
1✔
15
from urllib.request import urlretrieve
1✔
16
from zipfile import ZipFile
1✔
17

18
from slugify import slugify
1✔
19

20
from hdx.data.dataset import Dataset
1✔
21
from hdx.data.hdxobject import HDXError
1✔
22
from hdx.data.showcase import Showcase
1✔
23
from hdx.location.country import Country
1✔
24
from hdx.utilities.dateparse import default_date, default_enddate, parse_date_range
1✔
25
from hdx.utilities.dictandlist import dict_of_lists_add, dict_of_sets_add
1✔
26

27
logger = logging.getLogger(__name__)
1✔
28

29
# Maps the column names used by this module's tabular files to the HXL
# hashtags that are handed to the HDX resource generation calls.
hxltags = dict(
    indicator_id="#indicator+code",
    indicator_label_en="#indicator+name",
    country_id="#country+code",
    year="#date+year",
    value="#indicator+value+num",
    type="#description+type",
    metadata="#description",
)
38

39

40
def download_indicatorsets(
    base_url, folder, indicatorsetcodes, urlretrieve=urlretrieve
):
    """Download the zip file of every indicator set, reusing earlier downloads.

    For each code, ``<code>.zip`` is stored in ``folder`` next to a
    ``<code>.txt`` status file. A zip that already exists is reused only when
    its status file contains exactly ``OK``; otherwise the stale zip (and
    status file, if any) is deleted and the zip is fetched again from
    ``base_url`` + ``<code>.zip``.

    Args:
        base_url: URL prefix to which the zip file name is appended.
        folder: Folder in which zips and status files are kept.
        indicatorsetcodes: Iterable of indicator set codes.
        urlretrieve: Callable ``(url, path) -> (path, headers)``; replaceable
            for testing.

    Returns:
        Dictionary mapping each indicator set code to the path of its zip.

    Raises:
        OSError: If the content type of a download does not contain "zip".
    """
    downloaded = {}
    for code in indicatorsetcodes:
        zipname = f"{code}.zip"
        zippath = join(folder, zipname)
        marker = join(folder, f"{code}.txt")
        if exists(zippath):
            if exists(marker):
                with open(marker) as fp:
                    previously_ok = fp.read() == "OK"
                if previously_ok:
                    # Earlier download completed: use it as is.
                    downloaded[code] = zippath
                    continue
                remove(marker)
            # No confirmation that the existing zip is complete.
            remove(zippath)
        zippath, response_headers = urlretrieve(f"{base_url}{zipname}", zippath)
        content_type = response_headers.get_content_type()
        if "zip" not in content_type:
            raise OSError(f"Problem with {zippath}!")
        # The status file is written only after a successful download.
        with open(marker, "w") as fp:
            fp.write("OK")
            downloaded[code] = zippath
    return downloaded
65

66

67
def get_filepath(zipfile, inputfile, outputfolder, indicatorsetcode):
    """Extract a zip member and rewrite it with a lowercased first line.

    The member is extracted, renamed to ``orig_<filename>`` inside the
    indicator set's folder (``outputfolder/indicatorsetcode``), and then copied
    back to the extraction path with its first line converted to lower case.

    Args:
        zipfile: Open zip archive object providing ``extract``.
        inputfile: Name of the member inside the archive.
        outputfolder: Base output folder.
        indicatorsetcode: Indicator set code, used as the subfolder name.

    Returns:
        Path of the rewritten file (the path the member was extracted to).
    """
    member_folder, member_name = split(inputfile)
    set_folder = join(outputfolder, indicatorsetcode)
    # A member that already carries a folder component is extracted relative
    # to the base folder; NOTE(review): presumably that component equals the
    # indicator set code so the file still lands in set_folder - confirm.
    extract_root = outputfolder if member_folder else set_folder
    extracted_path = zipfile.extract(inputfile, path=extract_root)
    backup_path = join(set_folder, f"orig_{member_name}")

    rename(extracted_path, backup_path)
    with open(backup_path, mode="rt") as source:
        first_line = source.readline().lower()
        with open(extracted_path, "w") as target:
            target.write(first_line)
            # Remaining lines are copied through unchanged.
            copyfileobj(source, target)
    return extracted_path
84

85

86
def get_countriesdata(indicatorsets, downloader, folder):
    """Unpack every indicator set zip and gather what is needed per country.

    Each zip is scanned for members whose names contain ``README`` (a date is
    parsed from the name), ``LABEL`` (indicator list), ``COUNTRY`` (country
    list), ``METADATA`` (optional) and ``DATA_NATIONAL`` (data). The indicator
    and country files are read through ``downloader.get_tabular_rows``.

    Args:
        indicatorsets: Mapping from indicator set code to zip file path.
        downloader: Object providing ``get_tabular_rows``.
        folder: Base folder into which zip members are extracted.

    Returns:
        Tuple ``(countries, indheaders, indicatorsetsindicators,
        indicatorsetsdates, datafiles)`` where ``countries`` is a list of
        dictionaries with keys ``iso3``, ``iso2`` and ``countryname``,
        ``indheaders`` are the headers of the last indicator file read,
        ``indicatorsetsindicators`` maps set code to a dictionary holding
        ``rows`` and ``shortnames``, ``indicatorsetsdates`` maps set code to
        the date text found in the README name, and ``datafiles`` maps set
        code to ``(metadatapath, datapath)``.

    Raises:
        OSError: If a zip has no data, indicator or country file.
    """
    indheaders = None
    country_codes = set()
    datafiles = {}
    indicatorsetsdates = {}
    indicatorsetsindicators = {}
    for indicatorsetcode, zippath in indicatorsets.items():
        # Marker substring -> matching member name (last match wins).
        members = {
            "LABEL": None,
            "COUNTRY": None,
            "METADATA": None,
            "DATA_NATIONAL": None,
        }
        with ZipFile(zippath, "r") as archive:
            for member in archive.namelist():
                if "README" in member:
                    fuzzy = {}
                    parse_date_range(member.replace("_", " "), fuzzy=fuzzy)
                    indicatorsetsdates[indicatorsetcode] = "".join(fuzzy["date"])
                for marker in members:
                    if marker in member:
                        members[marker] = member
            if members["DATA_NATIONAL"] is None:
                raise OSError("No data file in zip!")
            if members["LABEL"] is None:
                raise OSError("No indicator file in zip!")
            if members["COUNTRY"] is None:
                raise OSError("No country file in zip!")

            # Indicator list: keep every row plus a set of shortened names.
            indpath = get_filepath(
                archive, members["LABEL"], folder, indicatorsetcode
            )
            indheaders, rows = downloader.get_tabular_rows(
                indpath,
                headers=1,
                dict_form=True,
                format="csv",
                encoding="WINDOWS-1252",
            )
            set_info = indicatorsetsindicators.setdefault(indicatorsetcode, {})
            for row in rows:
                dict_of_lists_add(set_info, "rows", row)
                shortname = re.sub(r"\s+", " ", row["indicator_label_en"])
                # Cut the label at the first ",", then "(", then ":".
                for separator in (",", "(", ":"):
                    shortname = shortname.partition(separator)[0]
                dict_of_sets_add(set_info, "shortnames", shortname.strip())

            # Country list: only the codes are needed.
            cntpath = get_filepath(
                archive, members["COUNTRY"], folder, indicatorsetcode
            )
            _, rows = downloader.get_tabular_rows(
                cntpath, headers=1, dict_form=True, format="csv"
            )
            for row in rows:
                country_codes.add(row["country_id"])

            metadatapath = None
            if members["METADATA"]:
                metadatapath = get_filepath(
                    archive, members["METADATA"], folder, indicatorsetcode
                )
            datapath = get_filepath(
                archive, members["DATA_NATIONAL"], folder, indicatorsetcode
            )
            datafiles[indicatorsetcode] = (metadatapath, datapath)

    countries = []
    for iso3 in sorted(country_codes):
        iso2 = Country.get_iso2_from_iso3(iso3)
        name = Country.get_country_name_from_iso3(iso3)
        # Codes that cannot be resolved to both values are left out.
        if iso2 is not None and name is not None:
            countries.append({"iso3": iso3, "iso2": iso2, "countryname": name})
    return countries, indheaders, indicatorsetsindicators, indicatorsetsdates, datafiles
164

165

166
def generate_dataset_and_showcase(
    indicatorsetcodes,
    indheaders,
    indicatorsetsindicators,
    indicatorsetsdates,
    country,
    datafiles,
    downloader,
    folder,
):
    """Build the HDX dataset and showcase of education indicators for a country.

    For every indicator set, up to three resources are generated: the data
    filtered to the country, the indicator list and, when a metadata file is
    available, the metadata filtered to the country. The dataset time period
    spans the earliest to the latest year seen in the country's data rows.

    Args:
        indicatorsetcodes: Mapping from indicator set code to a dictionary with
            a ``title`` and optionally ``quickcharts`` (list of dictionaries
            each having a ``code``).
        indheaders: Headers of the indicator list.
        indicatorsetsindicators: Mapping from indicator set code to a
            dictionary with ``rows`` and ``shortnames``.
        indicatorsetsdates: Mapping from indicator set code to the date text
            used in the dataset notes.
        country: Dictionary with keys ``iso3``, ``iso2`` and ``countryname``.
        datafiles: Mapping from indicator set code to
            ``(metadatafile, datafile)`` paths.
        downloader: Downloader object passed through to the dataset's resource
            generation.
        folder: Base folder; each set's output goes in a subfolder named by
            its code.

    Returns:
        Tuple ``(dataset, showcase, bites_disabled, qc_indicators)``. All four
        are ``None`` when the country cannot be added as a location or when no
        resource could be generated.
    """
    countryiso = country["iso3"]
    countryname = country["countryname"]
    title = f"{countryname} - Education Indicators"
    slugified_name = slugify(f"UNESCO data for {countryname}").lower()
    logger.info(f"Creating dataset: {title}")
    dataset = Dataset({"name": slugified_name, "title": title})

    dataset.set_maintainer("a5c5296a-3206-4e51-b2de-bfe34857185f")
    dataset.set_organization("18f2d467-dcf8-4b7e-bffa-b3c338ba3a7c")
    dataset.set_expected_update_frequency("Never")
    dataset.set_subnational(False)
    try:
        dataset.add_country_location(countryiso)
    except HDXError as e:
        logger.exception(f"{countryname} has a problem! {e}")
        # Same arity as every other return so callers can always unpack four
        # values (this path previously returned only three).
        return None, None, None, None
    tags = [
        "sustainable development",
        "demographics",
        "socioeconomics",
        "education",
        "indicators",
        "sustainable development goals-sdg",
        "hxl",
    ]
    dataset.add_tags(tags)

    # Start from inverted extremes so the first dated row sets both bounds.
    earliest_start_date = default_enddate
    latest_end_date = default_date

    def process_row(headers, row):
        """Keep only this country's data rows and track the years covered."""
        nonlocal earliest_start_date, latest_end_date
        # NOTE(review): returning None presumably tells the HDX library to
        # drop the row - confirm against its row_function contract.
        if row["country_id"] != countryiso:
            return None
        year = row["year"]
        if year:
            startdate, enddate = parse_date_range(
                year,
                zero_time=True,
                max_endtime=True,
            )
            if startdate < earliest_start_date:
                earliest_start_date = startdate
            if enddate > latest_end_date:
                latest_end_date = enddate
        return row

    def process_metadata_row(headers, row):
        """Keep only this country's metadata rows."""
        if row["country_id"] == countryiso:
            return row
        return None

    categories = []
    bites_disabled = None
    qc_indicators = None

    for indicatorsetcode in indicatorsetcodes:
        indicatorsetname = indicatorsetcodes[indicatorsetcode]["title"]
        metadatafile, datafile = datafiles[indicatorsetcode]
        indicatorsetindicators = indicatorsetsindicators[indicatorsetcode]
        indicator_names = indicatorsetindicators["shortnames"]

        # Resource 1: the data itself, filtered to this country.
        filename = f"{indicatorsetcode}_data_{countryiso}.csv"
        resourcename = f"{indicatorsetname} data"
        resourcedata = {
            "name": resourcename,
            "description": f"{indicatorsetname} data with HXL tags.\n\nIndicators: {', '.join(sorted(indicator_names))}",
        }
        indicators_for_qc = indicatorsetcodes[indicatorsetcode].get("quickcharts")
        if indicators_for_qc:
            values = [x["code"] for x in indicators_for_qc]
            quickcharts = {
                "hashtag": "#indicator+code",
                "values": values,
                "numeric_hashtag": "#indicator+value+num",
                "cutdown": 2,
                "cutdownhashtags": ["#indicator+code", "#country+code", "#date+year"],
            }
            # The last set that defines quickcharts is the one returned.
            qc_indicators = indicators_for_qc
        else:
            quickcharts = None
        outputfolder = join(folder, indicatorsetcode)
        success, results = dataset.download_and_generate_resource(
            downloader,
            datafile,
            hxltags,
            outputfolder,
            filename,
            resourcedata,
            row_function=process_row,
            quickcharts=quickcharts,
        )
        if success is False:
            logger.warning(f"{resourcename} for {countryname} has no data!")
            continue
        disabled_bites = results.get("bites_disabled")
        if disabled_bites:
            bites_disabled = disabled_bites

        # Resource 2: the indicator list (same rows for every country).
        filename = f"{indicatorsetcode}_indicatorlist_{countryiso}.csv"
        resourcename = f"{indicatorsetname} indicator list"
        resourcedata = {
            "name": resourcename,
            "description": f"{indicatorsetname} indicator list with HXL tags",
        }
        indicators = indicatorsetindicators["rows"]
        success, _ = dataset.generate_resource_from_iterable(
            indheaders, indicators, hxltags, outputfolder, filename, resourcedata
        )
        if success is False:
            logger.warning(f"{resourcename} for {countryname} has no data!")
            continue
        categories.append(
            f"{indicatorsetname} (made {indicatorsetsdates[indicatorsetcode]})"
        )

        # Resource 3 (optional): metadata, filtered to this country.
        if metadatafile:
            filename = f"{indicatorsetcode}_metadata_{countryiso}.csv"
            resourcename = f"{indicatorsetname} metadata"
            resourcedata = {
                "name": resourcename,
                "description": f"{indicatorsetname} metadata with HXL tags",
            }
            success, results = dataset.download_and_generate_resource(
                downloader,
                metadatafile,
                hxltags,
                outputfolder,
                filename,
                resourcedata,
                row_function=process_metadata_row,
            )
            if success is False:
                logger.warning(f"{resourcename} for {countryname} has no data!")
    if dataset.number_of_resources() == 0:
        logger.warning(f"{countryname} has no data!")
        return None, None, None, None
    dataset.set_time_period(earliest_start_date, latest_end_date)
    dataset.quickcharts_resource_last()
    notes = [
        f"Education indicators for {countryname}.\n\n",
        "Contains data from the UNESCO Institute for Statistics [bulk data service](http://data.uis.unesco.org) ",
        f"covering the following categories: {', '.join(categories)}",
    ]
    dataset["notes"] = "".join(notes)

    showcase = Showcase(
        {
            "name": f"{slugified_name}-showcase",
            "title": title,
            "notes": f"Education indicators for {countryname}",
            "url": f"https://uis.unesco.org/en/country/{country['iso2']}",
            "image_url": "https://tcg.uis.unesco.org/wp-content/uploads/sites/4/2021/09/combined_uis_colors_eng-002-300x240.png",
        }
    )
    showcase.add_tags(tags)

    return dataset, showcase, bites_disabled, qc_indicators
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc