• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / pymarc_dedupe / 62df4d4f-cdae-45a9-a6bd-0ac11c6e1770

22 May 2025 07:05PM UTC coverage: 99.158% (-0.8%) from 100.0%
62df4d4f-cdae-45a9-a6bd-0ac11c6e1770

Pull #24

circleci

maxkadel
Add output of comparison experiment - uses data set from Mark Z
Pull Request #24: Green locally - connect to Postgres DB

264 of 271 new or added lines in 10 files covered. (97.42%)

4 existing lines in 1 file now uncovered.

824 of 831 relevant lines covered (99.16%)

0.99 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.31
/src/marc_record.py
1
import string
1✔
2
import re
1✔
3
import pymarc
1✔
4
from src.gold_rush import GoldRush
1✔
5

6

7
class MarcRecord:
    """Expose normalized, comparison-friendly values from a single MARC record.

    ``record`` is used through the following operations only:
    ``record[tag]`` (expected to raise ``KeyError`` when the tag is absent),
    ``record.get(tag)``, ``record.get_linked_fields(field)`` and
    ``record.leader.type_of_record``.  Fields are used through
    ``field.get(code)``, ``field[code]``, ``field.get_subfields(code)`` and
    ``field.data``.
    NOTE(review): this matches the pymarc 5 ``Record``/``Field`` API -- confirm
    the pinned pymarc version.
    """

    # Subfields of a title field that make up the displayed title, in order.
    _TITLE_SUBFIELDS = ("a", "b", "p")
    # Main-entry tags, in the order in which they are consulted.
    _AUTHOR_TAGS = ("100", "110", "111")
    # Abbreviations expanded in edition statements.  The dot is escaped so it
    # only matches a literal full stop, and the look-behind refuses a match
    # that is preceded by a letter, so the "ed" inside "edition" or
    # "Revised." is left alone while "2ed." is still expanded.
    _EDITION_ABBREVIATIONS = (
        (re.compile(r"(?<![^\W\d_])Ed\."), "Edition"),
        (re.compile(r"(?<![^\W\d_])ed\."), "edition"),
    )
    # Abbreviations expanded in extent statements, applied in this order.
    _EXTENT_ABBREVIATIONS = (
        (re.compile(r"p\."), "pages"),
        (re.compile(r"v\."), "volumes"),
        (re.compile(r"vol\."), "volumes"),
        (re.compile(r"ℓ\."), "leaves"),
    )
    # Characters trimmed from both ends of a value; ")" is kept.
    _EDGE_CHARACTERS = string.punctuation.replace(")", "") + " "
    # Deletes every ASCII punctuation character except "&".
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation.replace("&", ""))

    def __init__(self, record):
        self.record = record

    def to_dictionary(self):
        """Takes a MarcRecord and returns a dictionary including the most salient fields"""
        return {
            "id": self.id(),
            "title": self.title(),
            "transliterated_title": self.transliterated_title(),
            "publication_year": self.publication_year() or "",
            "pagination": self.pagination(),
            "edition": self.edition(),
            "publisher_name": self.publisher_name(),
            "type_of": self.type_of(),
            "title_part": self.title_part(),
            "title_number": self.title_number(),
            "author": self.author(),
            "title_inclusive_dates": self.title_inclusive_dates(),
            "gov_doc_number": self.gov_doc_number(),
            "is_electronic_resource": self.is_electronic_resource(),
            "gold_rush": GoldRush(self).as_gold_rush(),
        }

    def id(self):
        """Return the data of the 001 field, or "" when there is no 001."""
        try:
            return self.record.get("001").data
        except (KeyError, AttributeError):
            # record.get() gave back None (no 001), so .data does not exist.
            return ""

    def title(self):
        """Return the title, preferring the field linked to the 245.

        Falls back to the 245 itself when no linked field is found, and to ""
        when there is no 245 at all.
        """
        # Evaluate the linked-field lookup once instead of once for the test
        # and once more for the assignment.
        field = self.__vernacular_title_field() or self.__title_from_245()
        return self.__title_text(field)

    def transliterated_title(self):
        """Return the title built from the 245 field only ("" without a 245)."""
        return self.__title_text(self.__title_from_245())

    def __title_text(self, title_field):
        """Join subfields a, b and p of ``title_field`` and trim the edges.

        Empty or missing subfields are skipped so that a missing middle
        subfield does not leave a double space inside the result.
        """
        if not title_field:
            return ""
        parts = [
            str(title_field.get(code) or "") for code in self._TITLE_SUBFIELDS
        ]
        joined = " ".join(part for part in parts if part)
        return self.__strip_ending_punctuation(joined)

    def __title_from_245(self):
        """Return the 245 field, or "" when the record has none."""
        try:
            return self.record["245"]
        except KeyError:
            return ""

    def __first_linked_field(self, tag):
        """Return the first field linked to ``tag``, or "" when unavailable.

        "Unavailable" covers a missing ``tag``, an empty list of linked
        fields, and ``MissingLinkedFields`` raised by the record.
        """
        try:
            field = self.record[tag]
        except KeyError:
            return ""
        try:
            linked = self.record.get_linked_fields(field)
        except pymarc.exceptions.MissingLinkedFields:
            return ""
        return linked[0] if linked else ""

    def __vernacular_title_field(self):
        """Return the first field linked to the 245, or ""."""
        return self.__first_linked_field("245")

    def publication_year(self):
        """Return the publication year as an int, or None when none is usable.

        Order of preference: date two of the 008 when both 008 dates are
        usable, date one when only it is usable, then 264 $c, then 260 $c.
        """
        first = self.date_one()
        second = self.date_two()
        if first:
            return second or first
        if second:
            # NOTE(review): with only date two usable the original logic
            # yields None without consulting 264/260; kept as-is -- confirm
            # whether a fallback is wanted here.
            return None
        return self.__date_of_production() or self.__date_of_publication() or None

    def pagination(self):
        """Return the normalized 300 $a, or "" when it is missing or empty."""
        extent = self.__subfield("300", "a")
        return self.__normalize_extent(extent) if extent else ""

    def edition(self):
        """Return the normalized 250 $a, or "" when it is missing."""
        return self.__normalize_edition(self.__subfield("250", "a"))

    def publisher_name(self):
        """Return 264 $b (or, failing that, 260 $b) without punctuation.

        Returns "" when neither field/subfield is present.
        """
        for tag in ("264", "260"):
            try:
                publisher = self.record[tag]["b"]
            except KeyError:
                # Either the tag or its $b is missing; try the next tag.
                continue
            return self.__strip_punctuation(publisher)
        return ""

    def type_of(self):
        """Return ``type_of_record`` from the record leader."""
        return self.record.leader.type_of_record

    def title_part(self):
        """Return every 245 $p after the first one, joined and normalized.

        The first $p is skipped because ``title()`` already includes it.
        """
        try:
            parts = self.record["245"].get_subfields("p")[1:]
        except KeyError:
            return ""
        return self.__strip_punctuation(" ".join(parts))

    def title_number(self):
        """Return 245 $n without punctuation, or "" when missing or empty."""
        number = self.__subfield("245", "n")
        return self.__strip_punctuation(number) if number else ""

    def author(self):
        """Return $a of the main entry, preferring a linked field.

        Tags 100, 110 and 111 are consulted in that order.  Returns "" when
        no field is found or the chosen field has no $a.
        """
        field = self.__vernacular_author_field() or self.__author_from_1xx()
        if not field:
            return ""
        name = field.get("a")
        return self.__strip_ending_punctuation(name) if name else ""

    def __author_from_1xx(self):
        """Return the first of the 100, 110, 111 fields present, or ""."""
        for tag in self._AUTHOR_TAGS:
            try:
                return self.record[tag]
            except KeyError:
                continue
        return ""

    def __vernacular_author_field(self):
        """Return the first field linked to the 100, 110 or 111, or ""."""
        for tag in self._AUTHOR_TAGS:
            linked = self.__first_linked_field(tag)
            if linked:
                return linked
        return ""

    def title_inclusive_dates(self):
        """Return 245 $f with its edges trimmed, or "" when missing or empty."""
        dates = self.__subfield("245", "f")
        return self.__strip_ending_punctuation(dates) if dates else ""

    def gov_doc_number(self):
        """Return 086 $a verbatim, or "" when the field or subfield is missing."""
        return self.__subfield("086", "a") or ""

    def is_electronic_resource(self):
        """Return True when the 245 $h, 533 $a, 300 $a or 007 says "electronic"."""
        return bool(
            self.__is_electronic_resource_from_title()
            or self.__is_electronic_resource_from_reproduction()
            or self.__is_electronic_resource_from_description()
            or self.__is_electronic_resource_from_007()
        )

    def __is_electronic_resource_from_title(self):
        """Return True when 245 $h contains "[electronic resource]".

        A case-insensitive containment test is used instead of strict
        equality so that a value carrying the punctuation that introduces the
        next subfield (for example "[electronic resource] /") still counts.
        """
        medium = self.__subfield("245", "h")
        return bool(medium) and "[electronic resource]" in medium.lower()

    def __is_electronic_resource_from_reproduction(self):
        """Return True when 533 $a starts with "electronic reproduction"."""
        note = self.__subfield("533", "a")
        if not note:
            return False
        return re.match("electronic reproduction", note, re.IGNORECASE) is not None

    def __is_electronic_resource_from_description(self):
        """Return True when 300 $a mentions "online resource"."""
        extent = self.__subfield("300", "a")
        if not extent:
            return False
        return bool(re.search("online resource", extent, re.IGNORECASE))

    def __is_electronic_resource_from_007(self):
        """Return True when the 007 data starts with "c".

        An empty or missing 007 yields False instead of raising.
        """
        try:
            field = self.record["007"]
        except KeyError:
            return False
        return (field.data or "").startswith("c")

    def __subfield(self, tag, code):
        """Return the first ``code`` subfield of ``tag``, or None if absent."""
        try:
            return self.record[tag].get(code)
        except KeyError:
            return None

    def __normalize_edition(self, edition):
        """Expand "Ed."/"ed." and remove punctuation; "" for non-string input."""
        if not isinstance(edition, str):
            return ""
        for pattern, replacement in self._EDITION_ABBREVIATIONS:
            edition = pattern.sub(replacement, edition)
        return self.__strip_punctuation(edition)

    def __normalize_extent(self, extent):
        """Expand "p.", "v.", "vol." and "ℓ." and remove punctuation."""
        for pattern, replacement in self._EXTENT_ABBREVIATIONS:
            extent = pattern.sub(replacement, extent)
        return self.__strip_punctuation(extent)

    def __strip_ending_punctuation(self, some_string):
        """Trim punctuation (except ")") and spaces from BOTH ends.

        Despite the name, leading characters are trimmed as well as trailing
        ones; characters inside the string are untouched.
        """
        return some_string.strip(self._EDGE_CHARACTERS)

    def __strip_punctuation(self, some_string):
        """Delete all ASCII punctuation except "&" and tidy the spacing.

        Any run of two or more spaces left behind by the deletion is
        collapsed to a single space, then the ends are stripped.
        """
        cleaned = some_string.translate(self._PUNCTUATION_TABLE)
        return re.sub(r" {2,}", " ", cleaned).strip()

    def is_valid_date(self, date_string):
        """Return True when ``date_string`` has length 4, parses with int()
        and is not the placeholder "9999"."""
        if date_string == "9999" or self.number_of_characters(date_string) != 4:
            return False
        try:
            int(date_string)
        except (ValueError, TypeError):
            # Covers blanks such as "    " and partial dates such as "19uu".
            return False
        return True

    def number_of_characters(self, date_string):
        """Return len(date_string), or False when it has no length."""
        try:
            return len(date_string)
        except TypeError:
            return False

    def __date_from_008(self, start, stop):
        """Return ``data[start:stop]`` of the 008 as a date.

        Returns None when the record has no 008, "" when the slice is not a
        valid date.
        """
        try:
            fixed_data = self.record["008"].data or ""
        except KeyError:
            return None
        return self.__as_date(fixed_data[start:stop])

    def date_one(self):
        """Return characters 7-10 of the 008 as an int year ("" or None if unusable)."""
        return self.__date_from_008(7, 11)

    def date_two(self):
        """Return characters 11-14 of the 008 as an int year ("" or None if unusable)."""
        return self.__date_from_008(11, 15)

    def __date_of_production(self):
        """Return 264 $c as an int year; None if missing, "" if not a valid date."""
        try:
            date_string = self.record["264"]["c"]
        except KeyError:
            return None
        return self.__as_date(date_string)

    def __date_of_publication(self):
        """Return 260 $c as an int year; "" if missing or not a valid date."""
        try:
            date_string = self.record["260"]["c"]
        except KeyError:
            return ""
        return self.__as_date(date_string)

    def __as_date(self, date_string):
        """Return ``date_string`` as an int year, or "" when it is not valid."""
        # Remove punctuation (for 260 and 264 fields)
        date_string = self.__strip_punctuation(date_string)
        if self.is_valid_date(date_string):
            return int(date_string)
        return ""
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc