• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / orangetheses / d7643155-150f-4689-a91e-119e5fbfab15

07 Oct 2024 08:29PM UTC coverage: 22.253% (-65.1%) from 87.344%
d7643155-150f-4689-a91e-119e5fbfab15

push

circleci

christinach
Remove pry-byebug

160 of 719 relevant lines covered (22.25%)

0.22 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

24.3
/lib/orangetheses/indexer.rb
1
# frozen_string_literal: true
2

3
require 'rsolr'
1✔
4
require 'rexml/document'
1✔
5
require 'chronic'
1✔
6
require 'logger'
1✔
7
require 'json'
1✔
8
require 'iso-639'
1✔
9
require 'yaml'
1✔
10
require 'erb'
1✔
11
require 'ostruct'
1✔
12

13
module Orangetheses
  # Builds Solr documents for senior theses and adds them to a Solr index.
  #
  # Two input shapes are supported:
  # * Dublin Core records wrapped in a REXML element (see #index)
  # * Hashes keyed by dotted "dc.*" / "pu.*" field names whose values are
  #   Arrays (see #index_document and #build_solr_document)
  class Indexer
    SET = 'Princeton University Senior Theses'

    # URL prefix that distinguishes ARK identifiers from other identifiers
    ARK_PREFIX = 'http://arks.princeton'

    # Dublin Core element name => Solr fields that receive its values
    NON_SPECIAL_ELEMENT_MAPPING = {
      'creator' => %w[author_display author_s],
      'contributor' => %w[advisor_display author_s],
      'format' => ['description_display'],
      'rights' => ['rights_reproductions_note_display'],
      'description' => ['summary_note_display']
    }.freeze

    # Dotted metadata field name => Solr fields that receive its values
    REST_NON_SPECIAL_ELEMENT_MAPPING = {
      'dc.contributor.author' => %w[author_display author_s],
      'dc.contributor.advisor' => %w[advisor_display author_s],
      'dc.contributor' => %w[contributor_display author_s],
      'pu.department' => %w[department_display author_s],
      'pu.certificate' => %w[certificate_display author_s],
      'dc.format.extent' => ['description_display'],
      'dc.description.abstract' => ['summary_note_display']
    }.freeze

    # Values merged last into every document, overriding earlier keys
    HARD_CODED_TO_ADD = {
      'format' => 'Senior thesis'
    }.freeze

    # @todo This needs to be refactored into a separate Class
    # @return [String] path to config/solr.yml, two directories above this file
    def self.config_file
      File.join(File.dirname(__FILE__), '..', '..', 'config', 'solr.yml')
    end

    # Reads the Solr configuration file and renders any ERB it contains.
    # @return [String] the rendered YAML text
    # @raise [RuntimeError] when the file cannot be read or rendered
    def self.config_yaml
      # File.read only ever opens a file; IO.read has historically treated a
      # path beginning with "|" as a command to execute.
      ERB.new(File.read(config_file)).result(binding)
    rescue StandardError, SyntaxError => e
      raise("#{config_file} was found, but could not be parsed with ERB. \n#{e.inspect}")
    end

    # @return [Object] the parsed YAML configuration (keyed by environment name,
    #   judging by how .config indexes it)
    def self.config_values
      YAML.safe_load(config_yaml)
    end

    # @return [String] ORANGETHESES_ENV when set, otherwise 'development'
    def self.env
      ENV['ORANGETHESES_ENV'] || 'development'
    end

    # @return [OpenStruct] responds to #solr with the settings for the current env
    def self.config
      OpenStruct.new(solr: config_values[env])
    end

    # @return [String, nil] the 'url' entry of the current environment's Solr settings
    def self.default_solr_url
      config.solr['url']
    end

    # @param solr_server [String, nil] Solr URL; falls back to .default_solr_url
    def initialize(solr_server = nil)
      solr_server ||= self.class.default_solr_url

      @solr = RSolr.connect(url: solr_server)
      @logger = Logger.new($stdout)
      @logger.level = Logger::INFO
      @logger.formatter = proc do |severity, datetime, _progname, msg|
        time = datetime.strftime('%H:%M:%S')
        "[#{time}] #{severity}: #{msg}\n"
      end
    end

    # Builds a Solr document from a Dublin Core record and adds it to Solr.
    # Errors are logged and swallowed rather than re-raised.
    #
    # @param metadata_element  A REXML::Element (because this is what we get from the OAI gem)
    # @return  The result of @solr.add (the HTTP response status from Solr??)
    def index(metadata_element)
      dc_elements = pull_dc_elements(metadata_element)
      doc = build_hash(dc_elements)
      @logger.info("Adding #{doc['id']}")
      @solr.add(doc, add_attributes: { commitWithin: 10 })
    rescue NoMethodError => e
      @logger.error(e.to_s)
      @logger.error(metadata_element)
    rescue StandardError => e
      @logger.error(e.to_s)
      # dc_elements is still nil when pull_dc_elements itself raised
      Array(dc_elements).each { |element| @logger.error(element.to_s) }
    end

    # Constructs DataspaceDocument objects from a Hash of attributes
    #
    # @param values [Hash] dotted field names ("dc.title", "pu.location", ...)
    #   mapped to Arrays of values, plus an 'id' entry
    # @return [DataspaceDocument]
    def build_solr_document(**values)
      title = values['dc.title']
      display_title = first_or_nil(title)
      call_number_display = call_number(values['dc.identifier.other'])
      language = code_to_language(values['dc.language.iso'])

      attrs = {
        'id' => values['id'],
        'title_t' => title_search_hash(title),
        'title_citation_display' => display_title,
        'title_display' => display_title,
        'title_sort' => title_sort_hash(title),
        'author_sort' => first_or_nil(values['dc.contributor.author']),
        'electronic_access_1display' => ark_hash(values),
        'pu.embargo.lift' => values['pu.embargo.lift'],
        'pu.embargo.terms' => values['pu.embargo.terms'],
        'pu.mudd.walkin' => values['pu.mudd.walkin'],
        'pu.location' => values['pu.location'],
        'dc.rights.accessRights' => values['dc.rights.accessRights'],
        'call_number_display' => call_number_display,
        'call_number_browse_s' => call_number_display,
        'language_facet' => language,
        'language_name_display' => language
      }

      # Later merges win, so the order below is significant
      attrs.merge!(map_rest_non_special_to_solr(values))
      attrs.merge!(holdings_access(values))
      attrs.merge!(class_year_fields(values))
      attrs.merge!(HARD_CODED_TO_ADD)

      DataspaceDocument.new(document: attrs, logger: @logger)
    end

    # Builds a document from a metadata Hash and adds it to Solr.
    # Errors are logged (together with the input values) and swallowed.
    #
    # @param values [Hash] Metadata hash with dc and pu terms
    # @return  The result of @solr.add (the HTTP response status from Solr??)
    def index_document(**values)
      solr_doc = build_solr_document(**values)

      @logger.info("Adding #{solr_doc['id']}")
      @solr.add(solr_doc, add_attributes: { commitWithin: 10 })
    rescue StandardError => e
      @logger.error(e.to_s)
      # Log the input: the built document may not exist if building it failed
      @logger.error(values.to_s)
    end

    private

    # Assembles the Solr field Hash for a Dublin Core record.
    # @param dc_elements [Array<REXML::Element>]
    # @return [Hash]
    def build_hash(dc_elements)
      date = choose_date(dc_elements)
      title_text = title(dc_elements)
      h = {
        'id' => id(dc_elements),
        'title_t' => title_text,
        'title_citation_display' => title_text,
        'title_display' => title_text,
        'title_sort' => title_sort(dc_elements),
        'author_sort' => author_sort(dc_elements),
        'format' => 'Senior Thesis',
        'pub_date_display' => date,
        'pub_date_start_sort' => date,
        'pub_date_end_sort' => date,
        'class_year_s' => date,
        'access_facet' => 'Online',
        'electronic_access_1display' => ark(dc_elements),
        'standard_no_1display' => non_ark_ids(dc_elements),
        'electronic_portfolio_s' => online_holding({})
      }
      h.merge!(map_non_special_to_solr(dc_elements))
      # Overrides the 'format' entry set above
      h.merge!(HARD_CODED_TO_ADD)
      h
    end

    # @return Array<REXML::Element>  the descriptive elements
    def pull_dc_elements(element)
      element.elements.to_a('oai_dc:dc/*')
    end

    # @return [Integer, nil] year of the earliest parseable date element
    def choose_date(dc_elements)
      # Chronic.parse yields nil for text it cannot interpret; drop those so
      # that min never has to compare nil with a time
      dates = all_date_elements(dc_elements).map { |d| Chronic.parse(d.text) }.compact
      dates.min&.year
    end

    def all_date_elements(dc_elements)
      dc_elements.select { |e| e.name == 'date' }
    end

    # @return [String, nil] text of the first title element
    def title(dc_elements)
      dc_elements.find { |e| e.name == 'title' }&.text
    end

    # @return [String, nil] sort key derived from the first title element
    def title_sort(dc_elements)
      sortable_title(title(dc_elements))
    end

    # Lower-cases the title, removes punctuation, drops one leading article
    # and finally removes all whitespace.
    # @param text [String, nil]
    # @return [String, nil]
    def sortable_title(text)
      return if text.nil?

      # \A (not ^) so that only an article at the very start is removed
      text.downcase.gsub(/[^\p{Alnum}\s]/, '').gsub(/\A(a|an|the)\s/, '').gsub(/\s/, '')
    end

    # @return [Array<String>] non-empty texts of all identifier elements
    def identifier_texts(dc_elements)
      dc_elements.select { |e| e.name == 'identifier' }.map(&:text).compact
    end

    # @return [Array<String>] identifier texts that are ARK URLs
    def ark_urls(dc_elements)
      identifier_texts(dc_elements).select { |text| text.start_with?(ARK_PREFIX) }
    end

    # @return [String, nil] JSON object mapping the first ARK URL to its link text
    def ark(dc_elements)
      url = ark_urls(dc_elements).first
      url && { url => dspace_display_text(dc_elements) }.to_json
    end

    # @return [String] JSON holding record for an online-only thesis
    def online_holding(doc)
      number = call_number(doc['dc.identifier.other'])
      {
        'thesis' => {
          'call_number' => number,
          'call_number_browse' => number,
          'dspace' => true
        }
      }.to_json
    end

    # @param accessible [Boolean] stored under the holding's 'dspace' key
    # @return [String] JSON holding record located at the Mudd Manuscript Library
    def physical_holding(doc, accessible: true)
      number = call_number(doc['dc.identifier.other'])
      {
        'thesis' => {
          'location' => 'Mudd Manuscript Library',
          'library' => 'Mudd Manuscript Library',
          'location_code' => 'mudd$stacks',
          'call_number' => number,
          'call_number_browse' => number,
          'dspace' => accessible
        }
      }.to_json
    end

    # @return [String, nil] JSON object listing identifiers that are not ARK URLs
    def non_ark_ids(dc_elements)
      others = identifier_texts(dc_elements).reject { |text| text.start_with?(ARK_PREFIX) }
      return if others.empty?

      { 'Other identifier' => others }.to_json
    end

    # @return [String, nil] last path segment of the first ARK URL
    def id(dc_elements)
      ark_urls(dc_elements).first&.split('/')&.last
    end

    # @return [String, nil] text of the first creator element
    def author_sort(dc_elements)
      dc_elements.find { |e| e.name == 'creator' }&.text
    end

    # @return [Integer, nil] year of the earliest parseable dc.date* value
    def choose_date_hash(doc)
      dates = all_date_elements_hash(doc).map { |_k, v| Chronic.parse(v.first) }.compact
      dates.min&.year
    end

    def all_date_elements_hash(doc)
      doc.select { |k, _v| k[/dc\.date/] }
    end

    # @param titles [Array<String>, nil]
    # @return [String, nil] sort key derived from the first title
    def title_sort_hash(titles)
      sortable_title(first_or_nil(titles))
    end

    # Take first title, strip out latex expressions when present to include along
    # with non-normalized version (allowing users to get matches both when LaTex
    # is pasted directly into the search box and when sub/superscripts are placed
    # adjacent to regular characters
    #
    # @return [String, Array<String>, nil] the title alone when it contains no
    #   \( ... \) expression, otherwise [original, normalized]
    def title_search_hash(titles)
      original = first_or_nil(titles)
      return if original.nil?

      normalized = original
      original.scan(/\\\(.*?\\\)/).each do |latex|
        normalized = normalized.gsub(latex, latex.gsub(/[^\p{Alnum}]/, ''))
      end
      normalized == original ? original : [original, normalized]
    end

    # @return [String, nil] JSON object mapping the first dc.identifier.uri value
    #   to its link text
    def ark_hash(doc)
      arks = doc['dc.identifier.uri']
      arks.nil? ? nil : { arks.first => dspace_display_text_hash(doc) }.to_json
    end

    # @param non_ark_ids [Array<String>, nil]
    # @return [String] 'AC102', followed by the first identifier when there is one
    def call_number(non_ark_ids)
      number = first_or_nil(non_ark_ids)
      # An empty list must not leave a trailing space behind
      number.nil? ? 'AC102' : "AC102 #{number}"
    end

    def first_or_nil(field)
      field&.first
    end

    # @return [Array<String>] link text: 'Citation only' when a rights element
    #   is present, 'Full text' otherwise
    def dspace_display_text(dc_elements)
      has_rights = dc_elements.any? { |e| e.name == 'rights' }
      [dataspace, has_rights ? citation : full_text]
    end

    # @return [Array<String>] link text: 'Citation only' for on-site-only
    #   documents, 'Full text' otherwise
    def dspace_display_text_hash(doc)
      [dataspace, on_site_only?(doc) ? citation : full_text]
    end

    # A document is on-site only when it carries a location, access-rights or
    # walk-in marker AND was accessioned before 2013, or when it is embargoed.
    #
    # @return [Boolean]
    def on_site_only?(doc)
      # NOTE(review): the rest of this class reads 'dc.rights.accessRights';
      # confirm that 'pu.rights.accessRights' is the intended key here.
      restricted = doc.key?('pu.location') || doc.key?('pu.rights.accessRights') || walkin?(doc)

      if restricted
        accessioned = Array(doc['dc.date.accessioned']).first

        # For theses, there is no physical copy since 2013
        # anything 2012 and prior have a physical copy
        # @see https://github.com/pulibrary/orangetheses/issues/76
        restricted = !accessioned.nil? && DateTime.parse(accessioned).year < 2013
      end

      restricted || embargo?(doc)
    end

    # @return [Boolean] true when the lift date (or, failing that, the terms
    #   date) parses to a time in the future
    def embargo?(doc)
      raw = doc['pu.embargo.lift'] || doc['pu.embargo.terms']
      return false if raw.nil?

      date = Chronic.parse(raw.first)
      if date.nil?
        @logger.info("No valid embargo date for #{doc['id']}")
        return false
      end

      date > Time.now
    end

    # @return [String, nil] the embargo date formatted like "July 1, 2100",
    #   or nil when no date is present or it cannot be parsed
    def embargo(doc)
      raw = doc['pu.embargo.lift'] || doc['pu.embargo.terms']
      return if raw.nil?

      Chronic.parse(raw.first)&.strftime('%B %-d, %Y')
    end

    # @return [Boolean] true when the first pu.mudd.walkin value is 'yes'
    def walkin?(doc)
      walkin = doc['pu.mudd.walkin']
      !walkin.nil? && walkin.first == 'yes'
    end

    # @return [String, Array, nil] an embargo notice, the location and
    #   access-rights values, the walk-in notice, or nil when none applies
    def restrictions_display_text(doc)
      if embargo?(doc)
        date = embargo(doc)
        "This content is embargoed until #{date}. For more information contact the "\
        "<a href=\"mailto:dspadmin@princeton.edu?subject=Regarding embargoed DataSpace Item 88435/#{doc['id']}\"> "\
        'Mudd Manuscript Library</a>.'
      elsif doc.key?('pu.location') || doc.key?('dc.rights.accessRights')
        [doc['pu.location'], doc['dc.rights.accessRights']].flatten.compact
      elsif walkin?(doc)
        walkin_text
      end
    end

    # @return [String] HTML notice for walk-in-only theses
    def walkin_text
      # Single-quoted strings keep backslashes literally, so the attribute
      # quotes must be written without escaping
      'Walk-in Access. This thesis can only be viewed on computer terminals at the '\
      '<a href="http://mudd.princeton.edu">Mudd Manuscript Library</a>.'
    end

    def dataspace
      'DataSpace'
    end

    def full_text
      'Full text'
    end

    def citation
      'Citation only'
    end

    # Copies element texts into Solr fields per NON_SPECIAL_ELEMENT_MAPPING.
    # Every mapped Solr field is present in the result, possibly as [].
    #
    # @param dc_elements [Array<REXML::Element>]
    # @return [Hash{String => Array}]
    def map_non_special_to_solr(dc_elements)
      NON_SPECIAL_ELEMENT_MAPPING.each_with_object({}) do |(element_name, fields), h|
        texts = dc_elements.select { |e| e.name == element_name }.map(&:text)
        fields.each { |f| (h[f] ||= []).concat(texts) }
      end
    end

    # Translates language codes into English language names.
    # Unknown codes are reported as 'English'.
    #
    # @param codes [Array<String>, nil]
    # @return [Array<String>, String] unique names, or the String 'English'
    #   when no codes were given
    def code_to_language(codes)
      languages = Array(codes).map do |code|
        # en_US is not valid iso code, so only the part before "_" is looked up
        found = ISO_639.find(code[/^[^_]*/])
        found.nil? ? 'English' : found.english_name
      end
      languages.empty? ? 'English' : languages.uniq
    end

    # Copies values into Solr fields per REST_NON_SPECIAL_ELEMENT_MAPPING.
    # Only Solr fields of metadata fields present in doc appear in the result.
    #
    # @param doc [Hash]
    # @return [Hash{String => Array}]
    def map_rest_non_special_to_solr(doc)
      REST_NON_SPECIAL_ELEMENT_MAPPING.each_with_object({}) do |(field_name, solr_fields), h|
        next unless doc.key?(field_name)

        solr_fields.each do |f|
          # flatten builds a fresh Array each time, so the Arrays owned by doc
          # are never shared with (or mutated through) the result
          h[f] = [h[f], doc[field_name]].flatten.compact
        end
      end
    end

    # @return [Hash] class-year and publication-date sort fields, or {} unless
    #   the first pu.date.classyear value consists solely of digits
    def class_year_fields(doc)
      class_year = doc['pu.date.classyear']
      return {} unless Array(class_year).first.to_s.match?(/\A\d+\z/)

      {
        'class_year_s' => class_year,
        'pub_date_start_sort' => class_year,
        'pub_date_end_sort' => class_year
      }
    end

    # online access when there isn't a restriction/location note
    #
    # @return [Hash] Mudd Library location and holding fields for embargoed or
    #   on-site-only documents, online access fields otherwise
    def holdings_access(doc)
      doc_embargoed = embargo?(doc)
      doc_on_site_only = on_site_only?(doc)

      unless doc_embargoed || doc_on_site_only
        return {
          'access_facet' => 'Online',
          'electronic_portfolio_s' => online_holding(doc)
        }
      end

      # This handles cases for items in the Mudd Library
      {
        'location' => 'Mudd Manuscript Library',
        'location_display' => 'Mudd Manuscript Library',
        'location_code_s' => 'mudd$stacks',
        'advanced_location_s' => ['mudd$stacks', 'Mudd Manuscript Library'],
        'access_facet' => doc_embargoed ? nil : 'In the Library',
        'holdings_1display' => physical_holding(doc, accessible: !doc_embargoed)
      }
    end
  end
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc