• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / orangetheses / 7daabf58-5dd8-40f2-ac0d-41590d5887a5

14 Oct 2024 07:22PM UTC coverage: 86.141% (+2.1%) from 84.032%
7daabf58-5dd8-40f2-ac0d-41590d5887a5

Pull #88

circleci

christinach
[#76] check for class year to add holding
replace 'restrictions_note_display' => restrictions_display_text(values)
update indexer specs
Pull Request #88: [#76] Theses before 2012 should not be requestable

3 of 3 new or added lines in 1 file covered. (100.0%)

1 existing line in 1 file now uncovered.

634 of 736 relevant lines covered (86.14%)

17.55 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

89.62
/lib/orangetheses/indexer.rb
1
# frozen_string_literal: true
2

3
require 'rsolr'
1✔
4
require 'rexml/document'
1✔
5
require 'chronic'
1✔
6
require 'logger'
1✔
7
require 'json'
1✔
8
require 'iso-639'
1✔
9
require 'yaml'
1✔
10
require 'erb'
1✔
11
require 'ostruct'
1✔
12

13
module Orangetheses
1✔
14
  class Indexer
1✔
15
    # Name of the senior theses set. Not referenced by any method in this class.
    SET = 'Princeton University Senior Theses'

    # Dublin Core element name => Solr fields that receive the element text
    # (consumed by #map_non_special_to_solr).
    NON_SPECIAL_ELEMENT_MAPPING = {
      'creator' => %w[author_display author_s],
      'contributor' => %w[advisor_display author_s],
      'format' => ['description_display'],
      'rights' => ['rights_reproductions_note_display'],
      'description' => ['summary_note_display']
    }.freeze

    # Metadata key => Solr fields that receive the values stored under that key
    # (consumed by #map_rest_non_special_to_solr).
    REST_NON_SPECIAL_ELEMENT_MAPPING = {
      'dc.contributor.author' => %w[author_display author_s],
      'dc.contributor.advisor' => %w[advisor_display author_s],
      'dc.contributor' => %w[contributor_display author_s],
      'pu.department' => %w[department_display author_s],
      'pu.certificate' => %w[certificate_display author_s],
      'dc.format.extent' => ['description_display'],
      'dc.description.abstract' => ['summary_note_display']
    }.freeze

    # Fields merged last into every document, overriding any earlier value.
    HARD_CODED_TO_ADD = {
      'format' => 'Senior thesis'
    }.freeze
38

39
    # @todo The configuration helpers need to be refactored into a separate class
    #
    # @return [String] path to config/solr.yml, two directories above this file
    def self.config_file
      File.join(File.dirname(__FILE__), '..', '..', 'config', 'solr.yml')
    end
43

44
    # Reads the Solr configuration file and renders it through ERB.
    #
    # @return [String] the rendered YAML text
    # @raise [RuntimeError] when the file cannot be read or the ERB fails to evaluate
    def self.config_yaml
      # File.read rather than IO.read: IO.read treats a leading "|" as a command to run.
      ERB.new(File.read(config_file)).result(binding)
    rescue StandardError, SyntaxError => e
      # SyntaxError is listed explicitly because it is not a StandardError.
      raise("#{config_file} was found, but could not be parsed with ERB. \n#{e.inspect}")
    end
49

50
    # Parses the ERB-rendered configuration as YAML.
    #
    # @return [Object] whatever YAML.safe_load yields for the rendered text
    def self.config_values
      YAML.safe_load(config_yaml)
    end
53

54
    # @return [String] the value of ORANGETHESES_ENV, or 'development' when it is unset
    def self.env
      ENV['ORANGETHESES_ENV'] || 'development'
    end
57

58
    # Wraps the configuration section for the current environment.
    #
    # @return [OpenStruct] responds to #solr with the section of the config keyed by .env
    def self.config
      OpenStruct.new(solr: config_values[env])
    end
61

62
    # @return [Object] the 'url' entry of the configured Solr section
    def self.default_solr_url
      config.solr['url']
    end
65

66
    # Connects to Solr and sets up an INFO-level logger writing to $stdout.
    #
    # @param solr_server [String, nil] Solr URL; falls back to .default_solr_url when nil
    def initialize(solr_server = nil)
      @solr = RSolr.connect(url: solr_server || self.class.default_solr_url)
      @logger = Logger.new($stdout)
      @logger.level = Logger::INFO
      # Log lines look like "[HH:MM:SS] SEVERITY: message"
      @logger.formatter = proc do |severity, datetime, _progname, msg|
        "[#{datetime.strftime('%H:%M:%S')}] #{severity}: #{msg}\n"
      end
    end
77

78
    # Builds a Solr document from a Dublin Core record and adds it to Solr.
    # Failures are logged instead of being raised.
    #
    # @param metadata_element [REXML::Element] record as handed over by the OAI gem
    # @return whatever @solr.add returns on success (presumably the Solr response — confirm)
    def index(metadata_element)
      dc_elements = pull_dc_elements(metadata_element)
      doc = build_hash(dc_elements)
      @logger.info("Adding #{doc['id']}")
      @solr.add(doc, add_attributes: { commitWithin: 10 })
    rescue NoMethodError => e
      @logger.error(e.to_s)
      @logger.error(metadata_element)
    rescue StandardError => e
      @logger.error(e.to_s)
      # dc_elements is still nil when pull_dc_elements itself raised; Array() keeps
      # the handler from raising a second error in that case.
      Array(dc_elements).each { |element| @logger.error(element.to_s) }
    end
92

93
    # Constructs a DataspaceDocument from a Hash of metadata attributes.
    #
    # @param values [Hash] metadata keyed by strings such as 'id', 'dc.title',
    #   'dc.contributor.author', 'dc.identifier.other' and 'dc.language.iso'
    # @return [DataspaceDocument]
    def build_solr_document(**values)
      title = values['dc.title']
      title_display = first_or_nil(title)
      call_number_display = call_number(values['dc.identifier.other'])
      language = code_to_language(values['dc.language.iso'])

      attrs = {
        'id' => values['id'],
        'title_t' => title_search_hash(title),
        'title_citation_display' => title_display,
        'title_display' => title_display,
        'title_sort' => title_sort_hash(title),
        'author_sort' => first_or_nil(values['dc.contributor.author']),
        'electronic_access_1display' => ark_hash(values),
        'restrictions_note_display' => restrictions_display_text(values),
        'call_number_display' => call_number_display,
        'call_number_browse_s' => call_number_display,
        'language_facet' => language,
        'language_name_display' => language
      }

      # Later merges win on key collisions, so HARD_CODED_TO_ADD has the final say.
      attrs.merge!(map_rest_non_special_to_solr(values))
      attrs.merge!(class_year_fields(values))
      attrs.merge!(holdings_access(values))
      attrs.merge!(HARD_CODED_TO_ADD)

      DataspaceDocument.new(document: attrs, logger: @logger)
    end
149

150
    # Builds a Solr document from metadata attributes and adds it to Solr.
    # Failures are logged instead of being raised.
    #
    # @param values [Hash] metadata hash with dc and pu terms
    # @return whatever @solr.add returns on success (presumably the Solr response — confirm)
    def index_document(**values)
      solr_doc = build_solr_document(**values)

      @logger.info("Adding #{solr_doc['id']}")
      @solr.add(solr_doc, add_attributes: { commitWithin: 10 })
    rescue StandardError => e
      # NoMethodError is a StandardError, so one handler covers both former branches.
      # Log the incoming values; no local named `doc` exists in this method.
      @logger.error(e.to_s)
      @logger.error(values.to_s)
    end
164

165
    private
1✔
166

167
    # Builds the Solr document hash for a Dublin Core record.
    #
    # @param dc_elements [Array<REXML::Element>] descriptive elements of the record
    # @return [Hash] Solr field name => value
    def build_hash(dc_elements)
      date = choose_date(dc_elements)
      display_title = title(dc_elements)
      h = {
        'id' => id(dc_elements),
        'title_t' => display_title,
        'title_citation_display' => display_title,
        'title_display' => display_title,
        'title_sort' => title_sort(dc_elements),
        'author_sort' => author_sort(dc_elements),
        # Replaced by HARD_CODED_TO_ADD['format'] in the final merge below.
        'format' => 'Senior Thesis',
        'pub_date_display' => date,
        'pub_date_start_sort' => date,
        'pub_date_end_sort' => date,
        'class_year_s' => date,
        'access_facet' => 'Online',
        'electronic_access_1display' => ark(dc_elements),
        'standard_no_1display' => non_ark_ids(dc_elements),
        'electronic_portfolio_s' => online_holding({})
      }
      h.merge!(map_non_special_to_solr(dc_elements))
      h.merge!(HARD_CODED_TO_ADD)
      h
    end
191

192
    # Selects the children of the record's oai_dc:dc node.
    #
    # @param element [REXML::Element]
    # @return [Array<REXML::Element>] the descriptive elements
    def pull_dc_elements(element)
      element.elements.to_a('oai_dc:dc/*')
    end
196

197
    # Picks the earliest year among the record's date elements.
    #
    # @param dc_elements [Array<REXML::Element>]
    # @return [Integer, nil] nil when no date element can be parsed
    def choose_date(dc_elements)
      # Chronic.parse yields nil for text it cannot read; drop those before comparing.
      dates = all_date_elements(dc_elements).map { |d| Chronic.parse(d.text) }.compact
      dates.min&.year
    end
201

202
    # @param dc_elements [Array<REXML::Element>]
    # @return [Array<REXML::Element>] the elements named 'date'
    def all_date_elements(dc_elements)
      dc_elements.select { |element| element.name == 'date' }
    end
205

206
    # @param dc_elements [Array<REXML::Element>]
    # @return [String, nil] text of the first 'title' element, nil when there is none
    def title(dc_elements)
      dc_elements.find { |element| element.name == 'title' }&.text
    end
210

211
    # Builds a sort key from the first title: lower-cased, punctuation removed,
    # a leading article (a/an/the) dropped, and all whitespace removed.
    #
    # @param dc_elements [Array<REXML::Element>]
    # @return [String, nil] nil when there is no title element (or it has no text)
    def title_sort(dc_elements)
      first_title = dc_elements.find { |element| element.name == 'title' }&.text
      return if first_title.nil?

      # \A anchors to the start of the string; ^ would also match after every newline.
      first_title.downcase
                 .gsub(/[^\p{Alnum}\s]/, '')
                 .sub(/\A(a|an|the)\s/, '')
                 .gsub(/\s/, '')
    end
216

217
    # Builds the JSON link entry for the record's first ARK identifier.
    #
    # @param dc_elements [Array<REXML::Element>]
    # @return [String, nil] JSON object mapping the ARK URL to its display labels,
    #   or nil when no identifier starts with 'http://arks.princeton'
    def ark(dc_elements)
      ark_element = dc_elements.find do |element|
        # to_s guards against identifier elements without text
        element.name == 'identifier' && element.text.to_s.start_with?('http://arks.princeton')
      end
      return if ark_element.nil?

      { ark_element.text => dspace_display_text(dc_elements) }.to_json
    end
223

224
    # Builds the JSON holding entry for an online thesis.
    #
    # @param doc [Hash] metadata; only 'dc.identifier.other' is read
    # @return [String] JSON with a single 'thesis' object flagged 'dspace' => true
    def online_holding(doc)
      number = call_number(doc['dc.identifier.other'])
      {
        'thesis' => {
          'call_number' => number,
          'call_number_browse' => number,
          'dspace' => true
        }
      }.to_json
    end
233

234
    # Builds the JSON holding entry for a thesis held at the Mudd Manuscript Library.
    #
    # @param doc [Hash] metadata; only 'dc.identifier.other' is read
    # @param accessible [Boolean] value stored under the 'dspace' key
    # @return [String] JSON with a single 'thesis' object
    def physical_holding(doc, accessible: true)
      number = call_number(doc['dc.identifier.other'])
      {
        'thesis' => {
          'location' => 'Mudd Manuscript Library',
          'library' => 'Mudd Manuscript Library',
          'location_code' => 'mudd$stacks',
          'call_number' => number,
          'call_number_browse' => number,
          'dspace' => accessible
        }
      }.to_json
    end
246

247
    # Collects identifier values that are not ARK URLs.
    #
    # @param dc_elements [Array<REXML::Element>]
    # @return [String, nil] JSON object {'Other identifier' => [...]}, or nil when
    #   there are no such identifiers
    def non_ark_ids(dc_elements)
      identifiers = dc_elements.select { |element| element.name == 'identifier' }
      # compact drops identifier elements without text, which would break start_with?
      others = identifiers.map(&:text).compact.reject { |text| text.start_with?('http://arks.princeton') }
      return if others.empty?

      { 'Other identifier' => others }.to_json
    end
255

256
    # Derives the document id from the record's first ARK identifier.
    #
    # @param dc_elements [Array<REXML::Element>]
    # @return [String, nil] the last '/'-separated segment of the ARK URL, or nil
    #   when no identifier starts with 'http://arks.princeton'
    def id(dc_elements)
      ark_element = dc_elements.find do |element|
        # to_s guards against identifier elements without text
        element.name == 'identifier' && element.text.to_s.start_with?('http://arks.princeton')
      end
      ark_element && ark_element.text.split('/').last
    end
262

263
    # @param dc_elements [Array<REXML::Element>]
    # @return [String, nil] text of the first 'creator' element, nil when there is none
    def author_sort(dc_elements)
      dc_elements.find { |element| element.name == 'creator' }&.text
    end
267

268
    # Picks the earliest year among the document's dc.date* values.
    #
    # @param doc [Hash] metadata hash
    # @return [Integer, nil] nil when no date value can be parsed
    def choose_date_hash(doc)
      # Only the first value of each date field is considered; nil/empty values are skipped.
      texts = all_date_elements_hash(doc).map { |_key, value| Array(value).first }.compact
      dates = texts.map { |text| Chronic.parse(text) }.compact
      dates.min&.year
    end
272

273
    # @param doc [Hash] metadata hash
    # @return [Hash] the entries whose key contains 'dc.date'
    def all_date_elements_hash(doc)
      doc.select { |key, _value| key.match?(/dc\.date/) }
    end
276

277
    # Builds a sort key from the first title: lower-cased, punctuation removed,
    # a leading article (a/an/the) dropped, and all whitespace removed.
    #
    # @param titles [Array<String>, nil]
    # @return [String, nil] nil when titles is nil or empty
    def title_sort_hash(titles)
      first_title = Array(titles).first
      return if first_title.nil?

      # \A anchors to the start of the string; ^ would also match after every newline.
      first_title.downcase
                 .gsub(/[^\p{Alnum}\s]/, '')
                 .sub(/\A(a|an|the)\s/, '')
                 .gsub(/\s/, '')
    end
280

281
    # Takes the first title and, when it contains LaTeX expressions of the form
    # \( ... \), also produces a normalized copy in which each expression is reduced
    # to its alphanumeric characters. Indexing both lets users match whether they
    # paste LaTeX into the search box or type sub/superscripts adjacent to regular
    # characters.
    #
    # @param titles [Array<String>, nil]
    # @return [String, Array<String>, nil] the title alone when it has no LaTeX,
    #   [original, normalized] when it does, nil when titles is nil or empty
    def title_search_hash(titles)
      original = Array(titles).first
      return if original.nil?

      normalized = original.gsub(/\\\(.*?\\\)/) { |latex| latex.gsub(/[^\p{Alnum}]/, '') }
      normalized == original ? original : [original, normalized]
    end
294

295
    # Builds the JSON link entry for the document's first 'dc.identifier.uri' value.
    #
    # @param doc [Hash] metadata hash
    # @return [String, nil] JSON object mapping the URI to its display labels, or
    #   nil when the document has no URI (key missing, nil, or empty list)
    def ark_hash(doc)
      uri = Array(doc['dc.identifier.uri']).first
      return if uri.nil?

      { uri => dspace_display_text_hash(doc) }.to_json
    end
299

300
    # Builds the call number: the fixed prefix 'AC102', followed by the first
    # identifier when one is present.
    #
    # @param non_ark_ids [Array<String>, nil]
    # @return [String] 'AC102' alone when non_ark_ids is nil or empty
    def call_number(non_ark_ids)
      first_id = Array(non_ark_ids).first
      first_id.nil? ? 'AC102' : "AC102 #{first_id}"
    end
303

304
    # @param field [#first, nil]
    # @return the first entry of field, or nil when field itself is nil
    def first_or_nil(field)
      field&.first
    end
307

308
    # Display labels for a record's DataSpace link.
    #
    # @param dc_elements [Array<REXML::Element>]
    # @return [Array<String>] the DataSpace label followed by the full-text label
    #   when the record has no 'rights' element, otherwise by the citation label
    def dspace_display_text(dc_elements)
      unrestricted = dc_elements.none? { |element| element.name == 'rights' }
      [dataspace, unrestricted ? full_text : citation]
    end
317

318
    # Display labels for a document's DataSpace link.
    #
    # @param doc [Hash] metadata hash
    # @return [Array<String>] the DataSpace label followed by the citation label
    #   when #on_site_only? holds for the document, otherwise by the full-text label
    def dspace_display_text_hash(doc)
      [dataspace, on_site_only?(doc) ? citation : full_text]
    end
327

328
    # Decides whether a thesis is restricted to on-site use.
    #
    # True when the document carries a restriction marker ('pu.location',
    # 'pu.rights.accessRights', or walk-in access) AND its first class year is
    # before 2013; otherwise falls back to whether the document is embargoed.
    #
    # @param doc [Hash] metadata hash
    # @return [Boolean]
    def on_site_only?(doc)
      restricted = doc.key?('pu.location') || doc.key?('pu.rights.accessRights') || walkin?(doc)

      if restricted
        class_years = doc.fetch('pu.date.classyear', [])
        # For theses, there is no physical copy since 2013;
        # anything 2012 and prior has a physical copy.
        # @see https://github.com/pulibrary/orangetheses/issues/76
        return true if !class_years.empty? && class_years.first.to_i < 2013
      end

      embargo?(doc)
    end
355

356
    # Whether the document is under an embargo that has not lifted yet.
    # 'pu.embargo.lift' takes precedence over 'pu.embargo.terms'.
    #
    # @param doc [Hash] metadata hash
    # @return [Boolean] false when no embargo field is present, when the field is
    #   empty, or when its first value cannot be parsed as a date
    def embargo?(doc)
      raw = doc['pu.embargo.lift'] || doc['pu.embargo.terms']
      first_value = Array(raw).first
      return false if first_value.nil?

      lift_date = Chronic.parse(first_value)
      if lift_date.nil?
        @logger.info("No valid embargo date for #{doc['id']}")
        return false
      end

      lift_date > Time.now
    end
368

369
    # Formats the document's embargo date for display, e.g. "July 4, 2030".
    # 'pu.embargo.lift' takes precedence over 'pu.embargo.terms'.
    #
    # @param doc [Hash] metadata hash
    # @return [String, nil] nil when no embargo field is present or its first value
    #   cannot be parsed
    def embargo(doc)
      raw = doc['pu.embargo.lift'] || doc['pu.embargo.terms']
      return if raw.nil?

      Chronic.parse(raw.first)&.strftime('%B %-d, %Y')
    end
375

376
    # @param doc [Hash] metadata hash
    # @return [Boolean] true when the first 'pu.mudd.walkin' value is exactly 'yes'
    def walkin?(doc)
      doc['pu.mudd.walkin']&.first == 'yes'
    end
380

381
    # Builds the HTML restriction note shown for an embargoed thesis, including a
    # mailto link whose subject names the document id.
    #
    # @param doc [Hash] metadata hash; reads 'id' and the embargo date via #embargo
    # @return [String]
    def build_embargo_text(doc)
      "This content is embargoed until #{embargo(doc)}. For more information contact the "\
      "<a href=\"mailto:dspadmin@princeton.edu?subject=Regarding embargoed DataSpace Item 88435/#{doc['id']}\"> "\
      'Mudd Manuscript Library</a>.'
    end
388

389
    # Restriction note shown for walk-in-only theses.
    #
    # @return [String] HTML snippet linking to the Mudd Manuscript Library site
    def walkin_text
      # Plain double quotes: inside a single-quoted literal, \" would emit a literal
      # backslash into the href attribute.
      'Walk-in Access. This thesis can only be viewed on computer terminals at the '\
      '<a href="http://mudd.princeton.edu">Mudd Manuscript Library</a>.'
    end
393

394
    # Chooses the restriction note for a document.
    #
    # Precedence: embargo text, then walk-in text, then the raw values of
    # 'pu.location' and 'dc.rights.accessRights'.
    #
    # @param doc [Hash] metadata hash
    # @return [String, Array<String>] a String for embargoed or walk-in documents,
    #   otherwise a (possibly empty) Array of note values
    def restrictions_display_text(doc)
      return build_embargo_text(doc) if embargo?(doc)
      return walkin_text if walkin?(doc)

      # Missing keys yield nil, which compact removes. The access-rights note is read
      # from the same key whose presence was previously tested
      # ('dc.rights.accessRights').
      doc.values_at('pu.location', 'dc.rights.accessRights').flatten.compact
    end
421

422
    # @return [String] label used as the first entry of the DataSpace link text
    def dataspace
      'DataSpace'
    end
425

426
    # @return [String] link label used when the full text is available
    def full_text
      'Full text'
    end
429

430
    # @return [String] link label used when only the citation is available
    def citation
      'Citation only'
    end
433

434
    # Copies element text into Solr fields according to NON_SPECIAL_ELEMENT_MAPPING.
    # A Solr field fed by several element names accumulates all of their texts, in
    # mapping order; every mapped field is present even when no element matches.
    #
    # @param dc_elements [Array<REXML::Element>]
    # @return [Hash{String => Array<String>}]
    def map_non_special_to_solr(dc_elements)
      NON_SPECIAL_ELEMENT_MAPPING.each_with_object({}) do |(element_name, fields), solr|
        texts = dc_elements.select { |element| element.name == element_name }.map(&:text)
        fields.each { |field| (solr[field] ||= []).concat(texts) }
      end
    end
449

450
    # Translates language codes into English language names, defaulting to English.
    #
    # @param codes [Array<String>, nil] codes such as 'en_US' or 'fr'
    # @return [String, Array<String>] the String 'English' when there are no codes,
    #   otherwise the unique language names (unknown codes count as 'English')
    def code_to_language(codes)
      languages = (codes || []).map do |code|
        # Only the part before the first underscore is looked up: en_US is not a
        # valid ISO code.
        entry = ISO_639.find(code[/^[^_]*/])
        entry.nil? ? 'English' : entry.english_name
      end
      languages.empty? ? 'English' : languages.uniq
    end
461

462
    # Copies metadata values into Solr fields according to
    # REST_NON_SPECIAL_ELEMENT_MAPPING. Only keys present in doc contribute; a Solr
    # field fed by several keys accumulates all of their values in mapping order.
    #
    # @param doc [Hash] metadata hash
    # @return [Hash{String => Array}]
    def map_rest_non_special_to_solr(doc)
      REST_NON_SPECIAL_ELEMENT_MAPPING.each_with_object({}) do |(field_name, solr_fields), solr|
        next unless doc.key?(field_name)

        solr_fields.each do |solr_field|
          # A fresh array is built on every pass so the arrays held by doc are never
          # mutated or shared between Solr fields.
          solr[solr_field] = [solr[solr_field], doc[field_name]].flatten.compact
        end
      end
    end
482

483
    # Derives the class-year and publication-date sort fields from
    # 'pu.date.classyear' when its first value consists solely of digits.
    #
    # @param doc [Hash] metadata hash
    # @return [Hash] empty when the class year is missing or not purely numeric;
    #   otherwise the full class-year value under 'class_year_s',
    #   'pub_date_start_sort' and 'pub_date_end_sort'
    def class_year_fields(doc)
      class_year = doc['pu.date.classyear']
      # \A and \z anchor the whole string; ^ and $ would accept a digit-only line
      # inside a multi-line value.
      return {} unless Array(class_year).first.to_s.match?(/\A\d+\z/)

      {
        'class_year_s' => class_year,
        'pub_date_start_sort' => class_year,
        'pub_date_end_sort' => class_year
      }
    end
492

493
    # Builds the location/holdings fields for a document.
    #
    # Documents that are neither embargoed nor on-site-only get online access and
    # an electronic portfolio. All others are placed in the Mudd Manuscript
    # Library: embargoed ones with no access facet and a holding flagged
    # inaccessible, the rest with the 'In the Library' facet.
    #
    # @param doc [Hash] metadata hash
    # @return [Hash] Solr field name => value
    def holdings_access(doc)
      embargoed = embargo?(doc)

      unless embargoed || on_site_only?(doc)
        return {
          'access_facet' => 'Online',
          'electronic_portfolio_s' => online_holding(doc)
        }
      end

      # This handles cases for items in the Mudd Library
      {
        'location' => 'Mudd Manuscript Library',
        'location_display' => 'Mudd Manuscript Library',
        'location_code_s' => 'mudd$stacks',
        'advanced_location_s' => ['mudd$stacks', 'Mudd Manuscript Library'],
        'access_facet' => embargoed ? nil : 'In the Library',
        'holdings_1display' => physical_holding(doc, accessible: !embargoed)
      }
    end
525
  end
526
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc