• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / orangetheses / 26706161-8696-498b-9937-36792a26a484

11 Oct 2024 10:04PM UTC coverage: 84.032% (-3.3%) from 87.326%
26706161-8696-498b-9937-36792a26a484

Pull #86

circleci

christinach
[#85] Remove pu. fields from def build_solr_document
 Theses fields are not included in the solr document and the theses.json file fails to index
Pull Request #86: Restructuring the generation of access restriction text during indexing and implementing a Rake Task for indexing OAI Items using a given Set ID

14 of 35 new or added lines in 4 files covered. (40.0%)

13 existing lines in 1 file now uncovered.

621 of 739 relevant lines covered (84.03%)

17.34 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

83.65
/lib/orangetheses/indexer.rb
1
# frozen_string_literal: true
2

3
require 'rsolr'
1✔
4
require 'rexml/document'
1✔
5
require 'chronic'
1✔
6
require 'logger'
1✔
7
require 'json'
1✔
8
require 'iso-639'
1✔
9
require 'yaml'
1✔
10
require 'erb'
1✔
11
require 'ostruct'
1✔
12

13
module Orangetheses
1✔
14
  class Indexer
1✔
15
    # Human-readable set name (presumably the OAI set harvested -- confirm against callers).
    SET = 'Princeton University Senior Theses'

    # Dublin Core element name => Solr fields that receive the element's text
    # (consumed by #map_non_special_to_solr).
    NON_SPECIAL_ELEMENT_MAPPING = {
      'creator' => %w[author_display author_s],
      'contributor' => %w[advisor_display author_s],
      'format' => %w[description_display],
      'rights' => %w[rights_reproductions_note_display],
      'description' => %w[summary_note_display]
    }.freeze

    # Qualified metadata key => Solr fields that receive the key's values
    # (consumed by #map_rest_non_special_to_solr).
    REST_NON_SPECIAL_ELEMENT_MAPPING = {
      'dc.contributor.author' => %w[author_display author_s],
      'dc.contributor.advisor' => %w[advisor_display author_s],
      'dc.contributor' => %w[contributor_display author_s],
      'pu.department' => %w[department_display author_s],
      'pu.certificate' => %w[certificate_display author_s],
      'dc.format.extent' => %w[description_display],
      'dc.description.abstract' => %w[summary_note_display]
    }.freeze

    # Fixed fields merged last into every document, overriding earlier values.
    HARD_CODED_TO_ADD = {
      'format' => 'Senior thesis'
    }.freeze
38

39
    # @todo This needs to be refactored into a separate Class
40
    # Location of the Solr settings file: config/solr.yml, two directories
    # above the directory holding this source file.
    # @return [String]
    def self.config_file
      source_dir = File.dirname(__FILE__)
      File.join(source_dir, '..', '..', 'config', 'solr.yml')
    end
43

44
    # Reads the Solr settings file and renders it through ERB.
    # @return [String] the rendered YAML text
    # @raise [RuntimeError] when the file cannot be read or the ERB template fails
    def self.config_yaml
      # File.read (not IO.read): IO.read treats a leading "|" as a command to run.
      ERB.new(File.read(config_file)).result(binding)
    rescue SystemCallError => e
      # Missing/unreadable file: do not claim the file "was found".
      raise("#{config_file} could not be read. \n#{e.inspect}")
    rescue StandardError, SyntaxError => e
      raise("#{config_file} was found, but could not be parsed with ERB. \n#{e.inspect}")
    end
49

50
    # Parses the ERB-rendered settings text as YAML.
    # @return [Object] whatever YAML.safe_load yields for the rendered text
    def self.config_values
      rendered = config_yaml
      YAML.safe_load(rendered)
    end
53

54
    # Name of the settings section to use: ORANGETHESES_ENV when set,
    # otherwise 'development'.
    # @return [String]
    def self.env
      ENV.fetch('ORANGETHESES_ENV', 'development')
    end
57

58
    # Wraps the settings of the current environment in an OpenStruct.
    # @return [OpenStruct] responds to #solr with the section selected by .env
    def self.config
      solr_settings = config_values[env]
      OpenStruct.new(solr: solr_settings)
    end
61

62
    # Solr URL taken from the 'url' entry of the current environment's settings.
    # @return [String, nil]
    def self.default_solr_url
      solr_settings = config.solr
      solr_settings['url']
    end
65

66
    # Connects to Solr and sets up an INFO-level logger on standard output.
    # @param solr_server [String, nil] Solr URL; the configured default is used when omitted
    def initialize(solr_server = nil)
      url = solr_server || self.class.default_solr_url
      @solr = RSolr.connect(url: url)

      @logger = Logger.new($stdout)
      @logger.level = Logger::INFO
      # Compact log line: "[HH:MM:SS] SEVERITY: message"
      @logger.formatter = proc do |severity, datetime, _progname, msg|
        "[#{datetime.strftime('%H:%M:%S')}] #{severity}: #{msg}\n"
      end
    end
77

78
    # Builds a Solr document from one OAI record and adds it to Solr.
    # Errors are logged rather than raised.
    # @param metadata_element [REXML::Element] (because this is what we get from the OAI gem)
    # @return the result of @solr.add on success; on failure, the value of the last logging expression
    def index(metadata_element)
      dc_elements = pull_dc_elements(metadata_element)
      doc = build_hash(dc_elements)
      @logger.info("Adding #{doc['id']}")
      @solr.add(doc, add_attributes: { commitWithin: 10 })
    rescue NoMethodError => e
      @logger.error(e.to_s)
      @logger.error(metadata_element)
    rescue StandardError => e
      @logger.error(e.to_s)
      # dc_elements is still nil when pull_dc_elements itself raised; Array()
      # keeps the handler from raising a second error in that case.
      Array(dc_elements).each { |element| @logger.error(element.to_s) }
    end
92

93
    # Constructs a DataspaceDocument from a Hash of metadata attributes.
    # Explicit fields are set first; mapped fields, holdings/access fields,
    # class-year fields and HARD_CODED_TO_ADD are merged on top in that order,
    # so later merges override earlier keys.
    # @param values [Hash] metadata keyed by field name (e.g. 'dc.title', 'pu.location')
    # @return [DataspaceDocument]
    def build_solr_document(**values)
      title = values['dc.title']
      title_display = first_or_nil(title)
      call_number_display = call_number(values['dc.identifier.other'])
      language = code_to_language(values['dc.language.iso'])

      attrs = {
        'id' => values['id'],
        'title_t' => title_search_hash(title),
        'title_citation_display' => title_display,
        'title_display' => title_display,
        'title_sort' => title_sort_hash(title),
        'author_sort' => first_or_nil(values['dc.contributor.author']),
        'electronic_access_1display' => ark_hash(values),
        'restrictions_note_display' => values['pu.location'],
        'call_number_display' => call_number_display,
        'call_number_browse_s' => call_number_display,
        'language_facet' => language,
        'language_name_display' => language
      }

      attrs.merge!(map_rest_non_special_to_solr(values))
      attrs.merge!(holdings_access(values))
      attrs.merge!(class_year_fields(values))
      attrs.merge!(HARD_CODED_TO_ADD)

      DataspaceDocument.new(document: attrs, logger: @logger)
    end
150

151
    # Builds a Solr document from a metadata Hash and adds it to Solr.
    # Errors are logged (message, then the input values) rather than raised.
    # @param values [Hash] metadata hash with dc and pu terms
    # @return the result of @solr.add on success; on failure, the value of the last logging expression
    def index_document(**values)
      solr_doc = build_solr_document(**values)

      @logger.info("Adding #{solr_doc['id']}")
      @solr.add(solr_doc, add_attributes: { commitWithin: 10 })
    rescue StandardError => e
      # NoMethodError is a StandardError, so a single handler covers both cases.
      @logger.error(e.to_s)
      # Log the input that failed; the former `doc` local never existed here.
      @logger.error(values.to_s)
    end
165

166
    private
1✔
167

168
    # Assembles the Solr document Hash for one record's Dublin Core elements.
    # @param dc_elements [Array<REXML::Element>]
    # @return [Hash]
    def build_hash(dc_elements)
      year = choose_date(dc_elements)
      display_title = title(dc_elements)

      solr_doc = {
        'id' => id(dc_elements),
        'title_t' => display_title,
        'title_citation_display' => display_title,
        'title_display' => display_title,
        'title_sort' => title_sort(dc_elements),
        'author_sort' => author_sort(dc_elements),
        # Replaced below when HARD_CODED_TO_ADD is merged in.
        'format' => 'Senior Thesis',
        'pub_date_display' => year,
        'pub_date_start_sort' => year,
        'pub_date_end_sort' => year,
        'class_year_s' => year,
        'access_facet' => 'Online',
        'electronic_access_1display' => ark(dc_elements),
        'standard_no_1display' => non_ark_ids(dc_elements),
        'electronic_portfolio_s' => online_holding({})
      }

      solr_doc.merge!(map_non_special_to_solr(dc_elements))
      solr_doc.merge!(HARD_CODED_TO_ADD)
    end
192

193
    # Selects the children of the record's oai_dc:dc element.
    # @param element [REXML::Element]
    # @return [Array<REXML::Element>] the descriptive elements
    def pull_dc_elements(element)
      children = element.elements
      children.to_a('oai_dc:dc/*')
    end
197

198
    # Picks the earliest year among the record's date elements.
    # Dates Chronic cannot parse are ignored (as #choose_date_hash does), so a
    # single malformed date no longer makes the comparison raise.
    # @param dc_elements [Array<REXML::Element>]
    # @return [Integer, nil] the earliest year, or nil when no date parses
    def choose_date(dc_elements)
      dates = all_date_elements(dc_elements).map { |d| Chronic.parse(d.text) }.compact
      dates.min&.year
    end
202

203
    # Keeps only the elements named 'date'.
    # @param dc_elements [Array<REXML::Element>]
    # @return [Array<REXML::Element>]
    def all_date_elements(dc_elements)
      dc_elements.reject { |element| element.name != 'date' }
    end
206

207
    # Text of the first 'title' element.
    # @param dc_elements [Array<REXML::Element>]
    # @return [String, nil] nil when there is no title element
    def title(dc_elements)
      first_title = dc_elements.find { |element| element.name == 'title' }
      first_title&.text
    end
211

212
    # Sort key from the first 'title' element: lowercased, punctuation removed,
    # a leading "a"/"an"/"the" dropped, then all whitespace removed.
    # @param dc_elements [Array<REXML::Element>]
    # @return [String, nil] nil when there is no title text
    def title_sort(dc_elements)
      raw = dc_elements.find { |element| element.name == 'title' }&.text
      return if raw.nil?

      without_punctuation = raw.downcase.gsub(/[^\p{Alnum}\s]/, '')
      without_article = without_punctuation.gsub(/^(a|an|the)\s/, '')
      without_article.gsub(/\s/, '')
    end
217

218
    # JSON object mapping the first Princeton ARK URL to its display text.
    # @param dc_elements [Array<REXML::Element>]
    # @return [String, nil] JSON text, or nil when no identifier is an ARK URL
    def ark(dc_elements)
      matches = dc_elements.select do |element|
        element.name == 'identifier' && element.text.start_with?('http://arks.princeton')
      end
      return nil if matches.empty?

      url = matches.first.text
      { url => dspace_display_text(dc_elements) }.to_json
    end
224

225
    # JSON holding record for an online thesis.
    # @param doc [Hash] metadata; 'dc.identifier.other' feeds the call number
    # @return [String] JSON text
    def online_holding(doc)
      number = call_number(doc['dc.identifier.other'])
      holding = {
        'call_number' => number,
        'call_number_browse' => number,
        'dspace' => true
      }
      { 'thesis' => holding }.to_json
    end
234

235
    # JSON holding record for a thesis held at the Mudd Manuscript Library.
    # @param doc [Hash] metadata; 'dc.identifier.other' feeds the call number
    # @param accessible [Boolean] value stored under the 'dspace' key
    # @return [String] JSON text
    def physical_holding(doc, accessible: true)
      number = call_number(doc['dc.identifier.other'])
      holding = {
        'location' => 'Mudd Manuscript Library',
        'library' => 'Mudd Manuscript Library',
        'location_code' => 'mudd$stacks',
        'call_number' => number,
        'call_number_browse' => number,
        'dspace' => accessible
      }
      { 'thesis' => holding }.to_json
    end
247

248
    # JSON object listing every identifier that is not a Princeton ARK URL.
    # @param dc_elements [Array<REXML::Element>]
    # @return [String, nil] JSON text, or nil when there is no such identifier
    def non_ark_ids(dc_elements)
      others = dc_elements.select do |element|
        element.name == 'identifier' && !element.text.start_with?('http://arks.princeton')
      end
      return nil if others.empty?

      { 'Other identifier' => others.map(&:text) }.to_json
    end
256

257
    # Document id: the last path segment of the first Princeton ARK URL.
    # @param dc_elements [Array<REXML::Element>]
    # @return [String, nil] nil when no identifier is an ARK URL
    def id(dc_elements)
      matches = dc_elements.select do |element|
        element.name == 'identifier' && element.text.start_with?('http://arks.princeton')
      end
      return nil if matches.empty?

      segments = matches.first.text.split('/')
      segments.last
    end
263

264
    # Text of the first 'creator' element.
    # @param dc_elements [Array<REXML::Element>]
    # @return [String, nil] nil when there is no creator element
    def author_sort(dc_elements)
      first_creator = dc_elements.find { |element| element.name == 'creator' }
      first_creator&.text
    end
268

269
    # Earliest year among the first values of the document's dc.date fields;
    # values Chronic cannot parse are skipped.
    # @param doc [Hash]
    # @return [Integer, nil]
    def choose_date_hash(doc)
      parsed = all_date_elements_hash(doc).values.map { |value| Chronic.parse(value.first) }
      earliest = parsed.compact.min
      earliest&.year
    end
273

274
    # Entries of the document whose key contains "dc.date".
    # @param doc [Hash]
    # @return [Hash]
    def all_date_elements_hash(doc)
      doc.select { |key, _value| key.match?(/dc\.date/) }
    end
277

278
    # Sort key from the first title: lowercased, punctuation removed, a leading
    # "a"/"an"/"the" dropped, then all whitespace removed.
    # @param titles [Array<String>, nil]
    # @return [String, nil] nil when titles is nil
    def title_sort_hash(titles)
      return if titles.nil?

      without_punctuation = titles.first.downcase.gsub(/[^\p{Alnum}\s]/, '')
      without_article = without_punctuation.gsub(/^(a|an|the)\s/, '')
      without_article.gsub(/\s/, '')
    end
281

282
    # Takes the first title and strips non-alphanumeric characters out of any
    # LaTeX expressions (text between \( and \)). When that changes the title,
    # both the original and the normalized form are returned, so searches match
    # whether LaTeX is pasted directly or sub/superscripts are typed adjacent to
    # regular characters.
    # @param titles [Array<String>, nil]
    # @return [String, Array<String>, nil] the title alone when unchanged,
    #   [original, normalized] otherwise, nil when titles is nil
    def title_search_hash(titles)
      return if titles.nil?

      original = titles.first
      normalized = original.scan(/\\\(.*?\\\)/).reduce(original) do |text, latex|
        text.gsub(latex, latex.gsub(/[^\p{Alnum}]/, ''))
      end
      normalized == original ? normalized : [original, normalized]
    end
295

296
    # JSON object mapping the first 'dc.identifier.uri' value to its display text.
    # @param doc [Hash]
    # @return [String, nil] JSON text, or nil when the document has no 'dc.identifier.uri'
    def ark_hash(doc)
      uris = doc['dc.identifier.uri']
      return nil if uris.nil?

      { uris.first => dspace_display_text_hash(doc) }.to_json
    end
300

301
    # Builds the call number: "AC102" followed by the first other identifier.
    # An empty list (or one whose first entry is nil) yields plain "AC102"
    # rather than "AC102 " with a trailing space.
    # @param non_ark_ids [Array<String>, nil] values of 'dc.identifier.other'
    # @return [String]
    def call_number(non_ark_ids)
      number = non_ark_ids&.first
      number.nil? ? 'AC102' : "AC102 #{number}"
    end
304

305
    # First value of a field, tolerating a missing (nil) field.
    # @param field [Array, nil]
    # @return [Object, nil]
    def first_or_nil(field)
      return nil if field.nil?

      field.first
    end
308

309
    # Link text for the DataSpace record: "Full text" when the record has no
    # 'rights' element, "Citation only" when it has one.
    # @param dc_elements [Array<REXML::Element>]
    # @return [Array<String>] [source label, access label]
    def dspace_display_text(dc_elements)
      unrestricted = dc_elements.none? { |element| element.name == 'rights' }
      access_label = unrestricted ? full_text : citation
      [dataspace, access_label]
    end
318

319
    # Link text for the DataSpace record: "Citation only" when the document is
    # on-site only, "Full text" otherwise.
    # @param doc [Hash]
    # @return [Array<String>] [source label, access label]
    def dspace_display_text_hash(doc)
      access_label = on_site_only?(doc) ? citation : full_text
      [dataspace, access_label]
    end
328

329
    # Decides whether a thesis is restricted to on-site use.
    # A document carrying a location note, an access-rights note or the walk-in
    # flag is restricted only when it was accessioned before 2013; otherwise
    # (including when the accession date is absent) the embargo check decides.
    # NOTE(review): this checks 'pu.rights.accessRights' while other methods in
    # this file read 'dc.rights.accessRights' -- confirm which key is intended.
    # @param doc [Hash]
    # @return [Boolean]
    def on_site_only?(doc)
      restricted = doc.key?('pu.location') || doc.key?('pu.rights.accessRights') || walkin?(doc)

      if restricted
        accession_dates = doc.fetch('dc.date.accessioned', [])
        # For theses, there is no physical copy since 2013;
        # anything 2012 and prior has a physical copy.
        # @see https://github.com/pulibrary/orangetheses/issues/76
        restricted = !accession_dates.empty? && DateTime.parse(accession_dates.first).year < 2013
      end

      restricted || embargo?(doc)
    end
358

359
    # Whether the document is under an embargo that ends in the future.
    # The date comes from 'pu.embargo.lift', falling back to 'pu.embargo.terms';
    # an unparseable date is logged and treated as "not embargoed".
    # @param doc [Hash]
    # @return [Boolean]
    def embargo?(doc)
      raw = doc['pu.embargo.lift'] || doc['pu.embargo.terms']
      return false if raw.nil?

      lift_date = Chronic.parse(raw.first)
      return lift_date > Time.now unless lift_date.nil?

      @logger.info("No valid embargo date for #{doc['id']}")
      false
    end
371

372
    # Formats the embargo end date for display, e.g. "July 1, 2030".
    # The date comes from 'pu.embargo.lift', falling back to 'pu.embargo.terms'
    # (the same precedence #embargo? uses). The previous guard returned nil
    # whenever 'pu.embargo.lift' was present, so the usual case produced no date.
    # @param doc [Hash]
    # @return [String, nil] nil when neither field is present or the date cannot be parsed
    def embargo(doc)
      raw = doc['pu.embargo.lift'] || doc['pu.embargo.terms']
      return if raw.nil?

      Chronic.parse(raw.first)&.strftime('%B %-d, %Y')
    end
380

381
    # Whether the first 'pu.mudd.walkin' value is exactly 'yes'.
    # @param doc [Hash]
    # @return [Boolean]
    def walkin?(doc)
      flags = doc['pu.mudd.walkin']
      return false if flags.nil?

      flags.first == 'yes'
    end
385

386
    # Restriction note for an embargoed thesis: states the embargo end date and
    # links a mailto address whose subject names the document id.
    # @param doc [Hash]
    # @return [String] HTML fragment
    def build_embargo_text(doc)
      lift_date = embargo(doc)
      mailto = "mailto:dspadmin@princeton.edu?subject=Regarding embargoed DataSpace Item 88435/#{doc['id']}"
      "This content is embargoed until #{lift_date}. For more information contact the " \
        "<a href=\"#{mailto}\"> Mudd Manuscript Library</a>."
    end
393

394
    # Restriction note for a walk-in-only thesis.
    # The quotes around the href are written unescaped: inside a single-quoted
    # Ruby literal, \" is a literal backslash followed by a quote, which put
    # stray backslashes into the emitted HTML attribute.
    # @return [String] HTML fragment
    def walkin_text
      'Walk-in Access. This thesis can only be viewed on computer terminals at the ' \
        '<a href="http://mudd.princeton.edu">Mudd Manuscript Library</a>.'
    end
398

399
    # Chooses the access-restriction note for a document.
    # An active embargo wins, then walk-in access; otherwise the location and
    # access-rights notes are returned. The access-rights values are read from
    # 'dc.rights.accessRights' -- the key whose presence was being checked --
    # instead of 'pu.rights.accessRights', which always dropped the value.
    # @param doc [Hash]
    # @return [String, Array] HTML text for embargoed or walk-in documents,
    #   otherwise the flattened, nil-free list of note values (possibly empty)
    def restrictions_display_text(doc)
      return build_embargo_text(doc) if embargo?(doc)
      return walkin_text if walkin?(doc)

      # Hash#[] yields nil for absent keys; compact removes those.
      [doc['pu.location'], doc['dc.rights.accessRights']].flatten.compact
    end
426

427
    # Label naming the repository in electronic-access link text.
    # @return [String]
    def dataspace
      'DataSpace'
    end
430

431
    # Access label used when the full text is available.
    # @return [String]
    def full_text
      'Full text'
    end
434

435
    # Access label used when only the citation is available.
    # @return [String]
    def citation
      'Citation only'
    end
438

439
    # Copies element text into Solr fields according to NON_SPECIAL_ELEMENT_MAPPING.
    # Every mapped Solr field is present in the result (an empty Array when no
    # element matched); a field fed by several element names accumulates the
    # texts in mapping order.
    # @param dc_elements [Array<REXML::Element>]
    # @return [Hash{String => Array<String>}]
    def map_non_special_to_solr(dc_elements)
      NON_SPECIAL_ELEMENT_MAPPING.each_with_object({}) do |(element_name, solr_fields), mapped|
        texts = dc_elements.select { |element| element.name == element_name }.map(&:text)
        solr_fields.each do |solr_field|
          (mapped[solr_field] ||= []).concat(texts)
        end
      end
    end
454

455
    # Translates language codes into English language names, defaulting to English.
    # Anything after an underscore is ignored because codes such as en_US are
    # not valid ISO codes; a code ISO_639 does not know counts as 'English'.
    # @param codes [Array<String>, nil]
    # @return [String, Array<String>] the String 'English' when no codes are
    #   given, otherwise the de-duplicated list of names
    def code_to_language(codes)
      names = if codes.nil?
                []
              else
                codes.map do |code|
                  entry = ISO_639.find(code[/^[^_]*/])
                  entry.nil? ? 'English' : entry.english_name
                end
              end
      names.empty? ? 'English' : names.uniq
    end
466

467
    # Copies document values into Solr fields according to
    # REST_NON_SPECIAL_ELEMENT_MAPPING. Only keys present in the document
    # contribute; a Solr field fed by several keys accumulates the values in
    # mapping order.
    # @param doc [Hash]
    # @return [Hash{String => Array}]
    def map_rest_non_special_to_solr(doc)
      REST_NON_SPECIAL_ELEMENT_MAPPING.each_with_object({}) do |(field_name, solr_fields), mapped|
        next unless doc.key?(field_name)

        solr_fields.each do |solr_field|
          # A fresh Array is built on every pass, so the document's own value
          # Arrays are never pushed onto.
          mapped[solr_field] = [mapped[solr_field], doc[field_name]].flatten.compact
        end
      end
    end
487

488
    # Year fields derived from 'pu.date.classyear'. They are produced only when
    # the first class-year value consists solely of digits.
    # @param doc [Hash]
    # @return [Hash] empty unless a numeric class year is present
    def class_year_fields(doc)
      return {} unless doc.key?('pu.date.classyear') && doc['pu.date.classyear'].first =~ /^\d+$/

      years = doc['pu.date.classyear']
      {
        'class_year_s' => years,
        'pub_date_start_sort' => years,
        'pub_date_end_sort' => years
      }
    end
497

498
    # Holdings and access fields for a document. Embargoed or on-site-only
    # documents get Mudd Manuscript Library location fields plus a physical
    # holding; every other document is marked as online.
    # @param doc [Hash]
    # @return [Hash]
    def holdings_access(doc)
      embargoed = embargo?(doc)
      restricted = on_site_only?(doc)

      # Online access when there is neither an embargo nor an on-site restriction.
      unless embargoed || restricted
        return {
          'access_facet' => 'Online',
          'electronic_portfolio_s' => online_holding(doc)
        }
      end

      # This handles cases for items in the Mudd Library. Embargoed items have
      # no access facet and an inaccessible holding.
      {
        'location' => 'Mudd Manuscript Library',
        'location_display' => 'Mudd Manuscript Library',
        'location_code_s' => 'mudd$stacks',
        'advanced_location_s' => ['mudd$stacks', 'Mudd Manuscript Library'],
        'access_facet' => embargoed ? nil : 'In the Library',
        'holdings_1display' => physical_holding(doc, accessible: !embargoed)
      }
    end
530
  end
531
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc