• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / orangetheses / 38778a1e-2dd3-4ec3-b4d6-a6c4d301682a

07 Dec 2023 05:02PM UTC coverage: 87.536% (+0.6%) from 86.964%
38778a1e-2dd3-4ec3-b4d6-a6c4d301682a

push

circleci

web-flow
Merge pull request #79 from pulibrary/i77-jrgriffiniii-date-display

Updates Rubocop, refactors DataSpace Solr Document generation, and ensures that invalid embargo dates trigger log warnings

190 of 196 new or added lines in 5 files covered. (96.94%)

12 existing lines in 1 file now uncovered.

604 of 690 relevant lines covered (87.54%)

18.31 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

87.28
/lib/orangetheses/indexer.rb
1
# frozen_string_literal: true
2

3
require 'erb'
require 'json'
require 'logger'
require 'ostruct'
require 'rexml/document'
require 'yaml'

require 'chronic'
require 'iso-639'
require 'rsolr'
1✔
11

12
module Orangetheses
1✔
13
  class Indexer
1✔
14
    # Name of the collection these records belong to.
    SET = 'Princeton University Senior Theses'

    # Dublin Core element name => Solr fields that receive the element's text.
    # Read by map_non_special_to_solr. Value arrays are frozen as well so the
    # shared mapping cannot be mutated by accident.
    NON_SPECIAL_ELEMENT_MAPPING = {
      'creator' => %w[author_display author_s],
      'contributor' => %w[advisor_display author_s],
      'format' => ['description_display'],
      'rights' => ['rights_reproductions_note_display'],
      'description' => ['summary_note_display']
    }.each_value(&:freeze).freeze

    # Metadata hash key => Solr fields that receive the key's values.
    # Read by map_rest_non_special_to_solr.
    REST_NON_SPECIAL_ELEMENT_MAPPING = {
      'dc.contributor.author' => %w[author_display author_s],
      'dc.contributor.advisor' => %w[advisor_display author_s],
      'dc.contributor' => %w[contributor_display author_s],
      'pu.department' => %w[department_display author_s],
      'pu.certificate' => %w[certificate_display author_s],
      'dc.format.extent' => ['description_display'],
      'dc.description.abstract' => ['summary_note_display']
    }.each_value(&:freeze).freeze

    # Fields merged last into every generated document, overriding earlier values.
    HARD_CODED_TO_ADD = {
      'format' => 'Senior thesis'
    }.freeze
37

38
    # @todo This needs to be refactored into a separate Class
39
    # @return [String] path to config/solr.yml, two directories above this file
    def self.config_file
      File.join(File.dirname(__FILE__), *%w[.. .. config solr.yml])
    end
42

43
    # Reads the Solr config file and renders it through ERB.
    # @return [String] the rendered YAML text
    # @raise [RuntimeError] when reading or ERB rendering fails
    def self.config_yaml
      ERB.new(File.read(config_file)).result(binding)
    rescue StandardError, SyntaxError => error
      raise "#{config_file} was found, but could not be parsed with ERB. \n#{error.inspect}"
    end
48

49
    # @return [Object] the parsed content of the rendered config YAML
    def self.config_values
      rendered = config_yaml
      YAML.safe_load(rendered)
    end
52

53
    # @return [String] value of ORANGETHESES_ENV, or 'development' when unset
    def self.env
      ENV.fetch('ORANGETHESES_ENV', 'development')
    end
56

57
    # Wraps the config section for the current environment.
    # Relies on the standard library `ostruct`, which must be required by the file.
    # @return [OpenStruct] responds to #solr with the settings found under the
    #   current environment key (nil when that key is absent)
    def self.config
      OpenStruct.new(solr: config_values[env])
    end
60

61
    # @return the 'url' entry of the configured Solr settings
    def self.default_solr_url
      solr_settings = config.solr
      solr_settings['url']
    end
64

65
    # Connects to Solr and sets up an INFO-level logger writing to $stdout.
    # @param solr_server [String, nil] Solr URL; the configured default is used when nil
    def initialize(solr_server = nil)
      @solr = RSolr.connect(url: solr_server || self.class.default_solr_url)
      @logger = Logger.new($stdout).tap do |log|
        log.level = Logger::INFO
        # Log lines look like "[HH:MM:SS] SEVERITY: message"
        log.formatter = lambda do |severity, datetime, _progname, msg|
          "[#{datetime.strftime('%H:%M:%S')}] #{severity}: #{msg}\n"
        end
      end
    end
76

77
    # Builds a Solr document from an OAI Dublin Core record and adds it to Solr.
    # Errors are logged instead of raised.
    # @param metadata_element  A REXML::Element (because this is what we get from the OAI gem)
    # @return  The result of the Solr add call (??) on success
    def index(metadata_element)
      dc_elements = pull_dc_elements(metadata_element)
      doc = build_hash(dc_elements)
      @logger.info("Adding #{doc['id']}")
      @solr.add(doc, add_attributes: { commitWithin: 10 })
    rescue NoMethodError => e
      @logger.error(e.to_s)
      @logger.error(metadata_element)
    rescue StandardError => e
      @logger.error(e.to_s)
      # dc_elements is still nil when pull_dc_elements itself raised;
      # Array() keeps the handler from raising a second error.
      Array(dc_elements).each { |element| @logger.error(element.to_s) }
    end
91

92
    # Constructs DataspaceDocument objects from a Hash of attributes
    # @param values [Hash] metadata keyed by dc.* / pu.* field names plus 'id'
    # @returns [DataspaceDocument]
    def build_solr_document(**values)
      titles = values['dc.title']
      search_title = title_search_hash(titles)
      display_title = first_or_nil(titles)
      sort_title = title_sort_hash(titles)
      sort_author = first_or_nil(values['dc.contributor.author'])
      access_link = ark_hash(values)
      shelf_mark = call_number(values['dc.identifier.other'])
      languages = code_to_language(values['dc.language.iso'])

      attrs = {
        'id' => values['id'],
        'title_t' => search_title,
        'title_citation_display' => display_title,
        'title_display' => display_title,
        'title_sort' => sort_title,
        'author_sort' => sort_author,
        'electronic_access_1display' => access_link,
        'pu.embargo.lift' => values['pu.embargo.lift'],
        'pu.embargo.terms' => values['pu.embargo.terms'],
        'pu.mudd.walkin' => values['pu.mudd.walkin'],
        'pu.location' => values['pu.location'],
        'dc.rights.accessRights' => values['dc.rights.accessRights'],
        'call_number_display' => shelf_mark,
        'call_number_browse_s' => shelf_mark,
        'language_facet' => languages,
        'language_name_display' => languages
      }

      # Later sources win on key collisions; the hard-coded fields go last.
      [
        map_rest_non_special_to_solr(values),
        holdings_access(values),
        class_year_fields(values),
        HARD_CODED_TO_ADD
      ].each { |extra| attrs.merge!(extra) }

      DataspaceDocument.new(document: attrs, logger: @logger)
    end
153

154
    # Builds a Solr document from the metadata and adds it to Solr.
    # Errors are logged, together with the metadata, instead of raised.
    # @param values [Hash] Metadata hash with dc and pu terms
    # @return  The result of the Solr add call (??) on success
    def index_document(**values)
      solr_doc = build_solr_document(**values)

      @logger.info("Adding #{solr_doc['id']}")
      @solr.add(solr_doc, add_attributes: { commitWithin: 10 })
    rescue StandardError => e
      # NoMethodError is a StandardError, so a single handler covers both cases.
      @logger.error(e.to_s)
      @logger.error(values.to_s)
    end
168

169
    private
1✔
170

171
    # Builds the Solr document hash for an OAI Dublin Core record.
    # @param dc_elements [Array] the record's descriptive elements
    # @return [Hash] Solr field name => value
    def build_hash(dc_elements)
      year = choose_date(dc_elements)
      record = { 'id' => id(dc_elements) }
      %w[title_t title_citation_display title_display].each do |field|
        record[field] = title(dc_elements)
      end
      record.merge!(
        'title_sort' => title_sort(dc_elements),
        'author_sort' => author_sort(dc_elements),
        'format' => 'Senior Thesis',
        'pub_date_display' => year,
        'pub_date_start_sort' => year,
        'pub_date_end_sort' => year,
        'class_year_s' => year,
        'access_facet' => 'Online',
        'electronic_access_1display' => ark(dc_elements),
        'standard_no_1display' => non_ark_ids(dc_elements),
        'electronic_portfolio_s' => online_holding({})
      )
      record.merge!(map_non_special_to_solr(dc_elements))
      # Merged last, so these values override any set above.
      record.merge!(HARD_CODED_TO_ADD)
    end
195

196
    # @param element  the record element to search
    # @return Array<REXML::Element>  the descriptive elements
    def pull_dc_elements(element)
      children = element.elements
      children.to_a('oai_dc:dc/*')
    end
200

201
    # Picks the year of the earliest parseable date element.
    # Dates Chronic cannot parse are ignored, as in choose_date_hash.
    # @param dc_elements [Array] the record's descriptive elements
    # @return [Integer, nil] the earliest year, or nil when no date parses
    def choose_date(dc_elements)
      dates = all_date_elements(dc_elements).map { |d| Chronic.parse(d.text) }.compact
      dates.empty? ? nil : dates.min.year
    end
205

206
    # @param dc_elements [Array] the record's descriptive elements
    # @return [Array] the elements named 'date'
    def all_date_elements(dc_elements)
      dc_elements.select do |element|
        element.name == 'date'
      end
    end
209

210
    # @param dc_elements [Array] the record's descriptive elements
    # @return [String, nil] text of the first 'title' element, nil when there is none
    def title(dc_elements)
      first_title = dc_elements.find { |element| element.name == 'title' }
      first_title&.text
    end
214

215
    # Builds the sort key for the first 'title' element: lower-cased, punctuation
    # removed, a leading article (a/an/the) dropped, and all whitespace removed.
    # The article is only stripped at the very start of the title (\A), not at
    # the start of every line.
    # @param dc_elements [Array] the record's descriptive elements
    # @return [String, nil] the sort key, nil when there is no title
    def title_sort(dc_elements)
      first_title = dc_elements.find { |e| e.name == 'title' }&.text
      return if first_title.nil?

      first_title.downcase
                 .gsub(/[^\p{Alnum}\s]/, '')
                 .sub(/\A(a|an|the)\s/, '')
                 .gsub(/\s/, '')
    end
220

221
    # Builds the electronic-access JSON for the first ARK identifier.
    # @param dc_elements [Array] the record's descriptive elements
    # @return [String, nil] JSON object mapping the ARK URL to its display text,
    #   nil when the record has no ARK identifier
    def ark(dc_elements)
      ark_elements = dc_elements.select do |element|
        element.name == 'identifier' && element.text.start_with?('http://arks.princeton')
      end
      return if ark_elements.empty?

      link = { ark_elements.first.text => dspace_display_text(dc_elements) }
      link.to_json
    end
227

228
    # @param doc [Hash] metadata; 'dc.identifier.other' feeds the call number
    # @return [String] JSON describing an online-only thesis holding
    def online_holding(doc)
      shelf_mark = call_number(doc['dc.identifier.other'])
      holding = {
        'call_number' => shelf_mark,
        'call_number_browse' => shelf_mark,
        'dspace' => true
      }
      { 'thesis' => holding }.to_json
    end
237

238
    # @param doc [Hash] metadata; 'dc.identifier.other' feeds the call number
    # @param accessible [Boolean] value stored under the 'dspace' key
    # @return [String] JSON describing a Mudd Manuscript Library thesis holding
    def physical_holding(doc, accessible: true)
      shelf_mark = call_number(doc['dc.identifier.other'])
      holding = {
        'location' => 'Mudd Manuscript Library',
        'library' => 'Mudd Manuscript Library',
        'location_code' => 'mudd$stacks',
        'call_number' => shelf_mark,
        'call_number_browse' => shelf_mark,
        'dspace' => accessible
      }
      { 'thesis' => holding }.to_json
    end
250

251
    # Collects the identifiers that are not ARK URLs.
    # @param dc_elements [Array] the record's descriptive elements
    # @return [String, nil] JSON of the form {"Other identifier": [...]},
    #   nil when there are no such identifiers
    def non_ark_ids(dc_elements)
      others = dc_elements.select do |element|
        element.name == 'identifier' && !element.text.start_with?('http://arks.princeton')
      end
      others.empty? ? nil : { 'Other identifier' => others.map(&:text) }.to_json
    end
259

260
    # Derives the document id from the first ARK identifier.
    # @param dc_elements [Array] the record's descriptive elements
    # @return [String, nil] last path segment of the ARK URL, nil when there is no ARK
    def id(dc_elements)
      ark_elements = dc_elements.select do |element|
        element.name == 'identifier' && element.text.start_with?('http://arks.princeton')
      end
      return if ark_elements.empty?

      ark_elements.first.text.split('/').last
    end
266

267
    # @param dc_elements [Array] the record's descriptive elements
    # @return [String, nil] text of the first 'creator' element, nil when there is none
    def author_sort(dc_elements)
      first_creator = dc_elements.find { |element| element.name == 'creator' }
      first_creator&.text
    end
271

272
    # Picks the year of the earliest parseable dc.date* value.
    # @param doc [Hash] metadata; only the first value of each date field is read
    # @return [Integer, nil] the earliest year, nil when no date parses
    def choose_date_hash(doc)
      parsed = all_date_elements_hash(doc).values.map { |values| Chronic.parse(values.first) }
      parsed.compact.min&.year
    end
276

277
    # @param doc [Hash] metadata hash
    # @return [Hash] the entries whose key contains 'dc.date'
    def all_date_elements_hash(doc)
      doc.reject { |key, _values| key[/dc\.date/].nil? }
    end
280

281
    # Builds the sort key for the first title: lower-cased, punctuation removed,
    # a leading article (a/an/the) dropped, and all whitespace removed.
    # The article is only stripped at the very start of the title (\A).
    # @param titles [Array<String>, nil] title values
    # @return [String, nil] the sort key, nil when titles is nil or empty
    def title_sort_hash(titles)
      first_title = titles&.first
      return if first_title.nil?

      first_title.downcase
                 .gsub(/[^\p{Alnum}\s]/, '')
                 .sub(/\A(a|an|the)\s/, '')
                 .gsub(/\s/, '')
    end
284

285
    # Takes the first title and, when it contains LaTeX expressions of the form
    # \( ... \), also produces a version with those expressions reduced to their
    # alphanumeric characters. Users then get matches both when LaTeX is pasted
    # directly into the search box and when sub/superscripts are placed adjacent
    # to regular characters.
    # @param titles [Array<String>, nil] title values
    # @return [String, Array<String>, nil] the title alone when it has no LaTeX,
    #   otherwise [original, normalized]; nil when titles is nil
    def title_search_hash(titles)
      return if titles.nil?

      original = titles.first
      normalized = original.scan(/\\\(.*?\\\)/).reduce(original) do |text, latex|
        text.gsub(latex, latex.gsub(/[^\p{Alnum}]/, ''))
      end
      normalized == original ? normalized : [original, normalized]
    end
298

299
    # Builds the electronic-access JSON from the first 'dc.identifier.uri' value.
    # @param doc [Hash] metadata hash
    # @return [String, nil] JSON object mapping the URI to its display text,
    #   nil when the document has no 'dc.identifier.uri'
    def ark_hash(doc)
      uris = doc['dc.identifier.uri']
      return if uris.nil?

      { uris.first => dspace_display_text_hash(doc) }.to_json
    end
303

304
    # Builds the call number: the fixed 'AC102' prefix followed by the first
    # identifier when one exists. A nil or empty list yields the bare prefix
    # with no trailing space.
    # @param non_ark_ids [Array, nil] other identifiers
    # @return [String]
    def call_number(non_ark_ids)
      ['AC102', non_ark_ids&.first].compact.join(' ')
    end
307

308
    # @param field [Array, nil] field values
    # @return the first value, or nil when field is nil
    def first_or_nil(field)
      return if field.nil?

      field.first
    end
311

312
    # Display text for the DataSpace link of an OAI record: records with a
    # 'rights' element are shown as citation only, all others as full text.
    # @param dc_elements [Array] the record's descriptive elements
    # @return [Array<String>] [source label, availability label]
    def dspace_display_text(dc_elements)
      source_label = dataspace
      unrestricted = dc_elements.none? { |element| element.name == 'rights' }
      [source_label, unrestricted ? full_text : citation]
    end
321

322
    # Display text for the DataSpace link of a metadata hash: on-site-only
    # documents are shown as citation only, all others as full text.
    # @param doc [Hash] metadata hash
    # @return [Array<String>] [source label, availability label]
    def dspace_display_text_hash(doc)
      source_label = dataspace
      [source_label, on_site_only?(doc) ? citation : full_text]
    end
331

332
    # A document is on-site only when it carries a location or access-rights
    # note, is under embargo, or is flagged as walk-in.
    # @param doc [Hash] metadata hash
    # @return [Boolean]
    def on_site_only?(doc)
      return true if %w[pu.location dc.rights.accessRights].any? { |field| doc.key?(field) }

      embargo?(doc) || walkin?(doc)
    end
336

337
    # Tells whether the document's embargo date lies in the future.
    # The date comes from 'pu.embargo.lift', falling back to 'pu.embargo.terms';
    # an unparseable date is logged and treated as no embargo.
    # @param doc [Hash] metadata hash
    # @return [Boolean]
    def embargo?(doc)
      raw = doc['pu.embargo.lift'] || doc['pu.embargo.terms']
      return false if raw.nil?

      lift_date = Chronic.parse(raw.first)
      return lift_date > Time.now unless lift_date.nil?

      @logger.info("No valid embargo date for #{doc['id']}")
      false
    end
349

350
    # Formats the document's embargo date for display, e.g. "July 1, 2100".
    # The date comes from 'pu.embargo.lift', falling back to 'pu.embargo.terms',
    # the same lookup embargo? uses.
    # @param doc [Hash] metadata hash
    # @return [String, nil] the formatted date, nil when no date is present or
    #   it cannot be parsed
    def embargo(doc)
      raw = doc['pu.embargo.lift'] || doc['pu.embargo.terms']
      return if raw.nil?

      Chronic.parse(raw.first)&.strftime('%B %-d, %Y')
    end
358

359
    # @param doc [Hash] metadata hash
    # @return [Boolean] true when the first 'pu.mudd.walkin' value is 'yes'
    def walkin?(doc)
      doc['pu.mudd.walkin']&.first == 'yes'
    end
363

364
    # Describes the access restriction that applies to a document.
    # @param doc [Hash] metadata hash
    # @return [String, Array, nil] an embargo notice; else the location and
    #   access-rights notes; else the walk-in notice; nil when unrestricted
    def restrictions_display_text(doc)
      if embargo?(doc)
        lift_date = embargo(doc)
        return "This content is embargoed until #{lift_date}. For more information contact the "\
               "<a href=\"mailto:dspadmin@princeton.edu?subject=Regarding embargoed DataSpace Item 88435/#{doc['id']}\"> "\
               'Mudd Manuscript Library</a>.'
      end

      has_note = doc.key?('pu.location') || doc.key?('dc.rights.accessRights')
      return [doc['pu.location'], doc['dc.rights.accessRights']].flatten.compact if has_note

      walkin_text if walkin?(doc)
    end
376

377
    # Notice shown for walk-in-only theses.
    # Inside single quotes a double quote needs no escaping; writing \" would
    # put a literal backslash into the href attribute.
    # @return [String] HTML snippet linking to the Mudd Manuscript Library
    def walkin_text
      'Walk-in Access. This thesis can only be viewed on computer terminals at the '\
      '<a href="http://mudd.princeton.edu">Mudd Manuscript Library</a>.'
    end
381

382
    # @return [String] label naming the repository that hosts the item
    def dataspace
      'DataSpace'
    end

    # @return [String] label for items whose full text is available
    def full_text
      'Full text'
    end

    # @return [String] label for items where only the citation is available
    def citation
      'Citation only'
    end
393

394
    # Copies element text into Solr fields according to
    # NON_SPECIAL_ELEMENT_MAPPING. Every mapped Solr field appears in the
    # result, with an empty array when no element matched; a field fed by
    # several element names accumulates all of their text.
    # @param dc_elements [Array] the record's descriptive elements
    # @return [Hash{String => Array<String>}]
    def map_non_special_to_solr(dc_elements)
      NON_SPECIAL_ELEMENT_MAPPING.each_with_object({}) do |(element_name, solr_fields), mapped|
        matching = dc_elements.select { |element| element.name == element_name }
        solr_fields.each do |solr_field|
          (mapped[solr_field] ||= []).concat(matching.map(&:text))
        end
      end
    end
409

410
    # Translates language codes into English language names, defaulting to
    # English. Only the part of each code before the first underscore is looked
    # up (en_US is not a valid iso code).
    # @param codes [Array<String>, nil] language codes
    # @return [String, Array<String>] 'English' when there are no codes,
    #   otherwise the unique language names
    def code_to_language(codes)
      names = codes&.map do |code|
        match = ISO_639.find(code[/^[^_]*/])
        match.nil? ? 'English' : match.english_name
      end
      names.nil? || names.empty? ? 'English' : names.uniq
    end
421

422
    # Copies metadata values into Solr fields according to
    # REST_NON_SPECIAL_ELEMENT_MAPPING. Only metadata keys present in doc
    # contribute; a Solr field fed by several keys accumulates all values.
    # @param doc [Hash] metadata hash
    # @return [Hash{String => Array}]
    def map_rest_non_special_to_solr(doc)
      REST_NON_SPECIAL_ELEMENT_MAPPING.each_with_object({}) do |(field_name, solr_fields), mapped|
        next unless doc.key?(field_name)

        solr_fields.each do |solr_field|
          # A fresh array is built each time, so the arrays held in doc are
          # never mutated or shared between Solr fields.
          mapped[solr_field] = [mapped[solr_field], doc[field_name]].flatten.compact
        end
      end
    end
442

443
    # Derives the class-year and publication-date sort fields from
    # 'pu.date.classyear' when its first value consists solely of digits.
    # The whole value must match (\A...\z), so multi-line values are rejected.
    # @param doc [Hash] metadata hash
    # @return [Hash] the three year fields, or an empty hash when the class year
    #   is absent, nil or not numeric
    def class_year_fields(doc)
      class_year = doc['pu.date.classyear']
      return {} unless Array(class_year).first.to_s.match?(/\A\d+\z/)

      {
        'class_year_s' => class_year,
        'pub_date_start_sort' => class_year,
        'pub_date_end_sort' => class_year
      }
    end
452

453
    # Chooses the holdings and access fields for a document: embargoed and
    # on-site-only documents get a Mudd Manuscript Library holding; online
    # access applies when there isn't a restriction/location note.
    # @param doc [Hash] metadata hash
    # @return [Hash] Solr field name => value
    def holdings_access(doc)
      mudd = {
        'location' => 'Mudd Manuscript Library',
        'location_display' => 'Mudd Manuscript Library',
        'location_code_s' => 'mudd$stacks',
        'advanced_location_s' => ['mudd$stacks', 'Mudd Manuscript Library']
      }

      if embargo?(doc)
        # Embargoed items carry no access_facet.
        mudd.merge('holdings_1display' => physical_holding(doc, accessible: false))
      elsif on_site_only?(doc)
        mudd.merge('access_facet' => 'In the Library', 'holdings_1display' => physical_holding(doc))
      else
        { 'access_facet' => 'Online', 'electronic_portfolio_s' => online_holding(doc) }
      end
    end
479
  end
480
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc