• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / orangetheses / d7643155-150f-4689-a91e-119e5fbfab15

07 Oct 2024 08:29PM UTC coverage: 22.253% (-65.1%) from 87.344%
d7643155-150f-4689-a91e-119e5fbfab15

push

circleci

christinach
Remove pry-byebug

160 of 719 relevant lines covered (22.25%)

0.22 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

23.5
/lib/orangetheses/visual.rb
1
# frozen_string_literal: true
2

3
require 'cgi'
require 'erb'
require 'ostruct'
require 'uri'
require 'yaml'
require 'rsolr'
1✔
4
require 'rexml/document'
1✔
5
require 'chronic'
1✔
6
require 'logger'
1✔
7
require 'json'
1✔
8
require 'faraday'
1✔
9
require 'rubygems/package'
1✔
10
require 'zlib'
1✔
11
require 'tmpdir'
1✔
12

13
module Orangetheses
1✔
14
  class Visual
1✔
15
    # File name of the visuals tarball; also used as the local download name.
    VISUALS = 'VisualsResults.tar.gz'
1✔
16
    # URL the tarball is downloaded from.
    VISUALS_URL = "http://libweb5.princeton.edu/NewStaff/visuals/#{VISUALS}".freeze
1✔
17
    # Replaces the '--' between subject subdivisions in the full subject strings.
    SEPARATOR = '—'
1✔
18

19
    ### Unique visual xml elements ###
20
    # id: solr id
21
    # title: title display
22
    # othertitle: other title field
23
    # imprint: publication display
24
    # unitdate: publication display
25
    # year1: pub year, sometimes centuries or uuuu
26
    # physdesc: description - size of...
27
    # note: multivalued
28
    # acqinfo: acquisitions note
29
    # creator: author display
30
    # contributor: experimenting with author display
31
    # subject: hierarchical, separated by '--'
32
    # genreform: genre - capitalize first letter
33
    ### Holdings based stuff ###
34
    # callno: call number
35
    # physicallocation: location note
36
    # link: link to file - can be multivalued
37
    # colllink: treat the same as link
38
    # collection: <holdings>: location code ctsn,ex,ga,map,mss,mudd,num,rcpxr,thx,wa,whs
39
    ######## IGNORE ########
40
    # designation: identical to callno
41
    # langcode: zxx - no language content
42
    # type: always image
43
    # source: always Visuals
44
    # year2: only 22 examples, some inconsistencies
45
    # holdings: assuming single holding
46
    # item:  <holdings>: assuming single item
47
    # primoItem: <holdings><item>: callno + physicallocation
48
    # temdata: <holdings><item>: identical to physical location
49
    # Maps a visual XML element name to the Solr fields that receive the text of
    # every element with that name. Several elements may feed the same field
    # (creator and contributor both feed the author fields).
    NON_SPECIAL_ELEMENT_MAPPING = {
50
      'creator' => %w[author_display author_s],
1✔
51
      'contributor' => %w[author_display author_s],
52
      'physdesc' => ['description_display'],
53
      'note' => ['notes_display'],
54
      'acqinfo' => ['source_acquisition_display'],
55
      'unitdate' => ['pub_date_display'],
56
      'callno' => %w[call_number_display call_number_browse_s]
57
    }.freeze
58

59
    # Fixed Solr fields merged into every document built from a visual record.
    HARD_CODED_TO_ADD = {
1✔
60
      'format' => 'Visual material'
61
    }.freeze
62

63
    # @todo This needs to be refactored into a separate Class
64
    # Locates the Solr configuration file relative to this source file.
    #
    # @return [String] path to config/solr.yml, two directories above this file's directory
    def self.config_file
      here = File.dirname(__FILE__)
      File.join(here, '..', '..', 'config', 'solr.yml')
    end
67

68
    # Reads the Solr configuration file and evaluates any ERB tags it contains.
    #
    # The file is read with File.read rather than IO.read, because IO.read treats
    # a path that starts with "|" as a command to run, which is never wanted for
    # a configuration path.
    #
    # @return [String] the configuration text after ERB evaluation
    # @raise [RuntimeError] when the file cannot be read or the ERB cannot be evaluated
    def self.config_yaml
      ERB.new(File.read(config_file)).result(binding)
    # SyntaxError is named explicitly: it is not a StandardError, and a malformed
    # template raises it.
    rescue StandardError, SyntaxError => e
      raise("#{config_file} was found, but could not be parsed with ERB. \n#{e.inspect}")
    end
73

74
    # Parses the rendered Solr configuration text as YAML.
    #
    # @return [Object] whatever YAML.safe_load produces for the rendered text
    def self.config_values
      rendered = config_yaml
      YAML.safe_load(rendered)
    end
77

78
    # Names the environment whose Solr settings are used.
    #
    # @return [String] the value of ORANGETHESES_ENV, or 'development' when it is unset
    def self.env
      ENV.fetch('ORANGETHESES_ENV', 'development')
    end
81

82
    # Wraps the Solr settings of the current environment in an object.
    #
    # @return [OpenStruct] object whose #solr is the config_values entry keyed by env
    def self.config
      settings = config_values[env]
      OpenStruct.new(solr: settings)
    end
85

86
    # Solr URL taken from the 'url' entry of the configured Solr settings.
    #
    # @return [Object] the value stored under 'url'
    def self.default_solr_url = config.solr['url']
89

90
    # Prepares a temporary working directory, a Solr client and a logger that
    # writes to standard output.
    #
    # @param solr_server [String, nil] Solr URL; when nil the class's default_solr_url is used
    def initialize(solr_server = nil)
      solr_server ||= self.class.default_solr_url

      @tmpdir = Dir.mktmpdir
      @solr = RSolr.connect(url: solr_server, timeout: 120, open_timeout: 120)
      # Log lines look like "[HH:MM:SS] SEVERITY: message".
      line_format = proc do |severity, datetime, _progname, msg|
        "[#{datetime.strftime('%H:%M:%S')}] #{severity}: #{msg}\n"
      end
      @logger = Logger.new($stdout, level: Logger::INFO, formatter: line_format)
    end
102

103
    # Sends the delete query "id:visuals*" to Solr and commits.
    #
    # @return [Object] the result of the Solr commit
    def delete_stale_visuals
      stale_query = 'id:visuals*'
      @solr.delete_by_query(stale_query)
      @solr.commit
    end
107

108
    # Fetches the visuals export, then processes every XML file found directly
    # in the working directory.
    #
    # @return [Array<String>] the XML file paths that were processed
    def process_all_visuals
      get_all_visuals
      xml_paths = Dir.glob("#{@tmpdir}/*.xml")
      xml_paths.each do |xml_path|
        process_visual_file(xml_path)
      end
    end
112

113
    private
1✔
114

115
    # rubocop:disable Naming/AccessorMethodName
116
    # Downloads the visuals tarball into the working directory and unpacks it there.
    #
    # curl and tar are invoked with argument lists instead of interpolated shell
    # strings, so neither the URL nor the paths are re-parsed by a shell. A failed
    # download or extraction is logged instead of passing unnoticed.
    #
    # @return [Boolean] true when both commands succeeded, false otherwise
    def get_all_visuals
      archive = File.join(@tmpdir, VISUALS)
      # --fail makes curl exit non-zero on an HTTP error instead of saving the error page.
      unless system('curl', '--fail', '--output', archive, VISUALS_URL)
        @logger.error("Unable to download #{VISUALS_URL}")
        return false
      end
      unless system('tar', '-zxf', archive, '-C', @tmpdir)
        @logger.error("Unable to extract #{archive}")
        return false
      end
      true
    end
120
    # rubocop:enable Naming/AccessorMethodName
121

122
    # Converts every record of one visuals XML file into a Solr document, adds
    # the batch to Solr and commits.
    #
    # @param visual [String] path to the XML file
    # @return [Array<Hash>] the documents that were sent to Solr
    def process_visual_file(visual)
      # The block form closes the file handle as soon as the document is parsed.
      doc = File.open(visual) { |file| REXML::Document.new(file) }
      objects = []
      doc.elements.each('*/record') { |record| objects << build_hash(record.elements.to_a) }
      @logger.info("Adding #{visual}")
      @solr.add(objects)
      @solr.commit
      objects
    end
131

132
    # Builds the Solr document for a single visual record.
    #
    # @param elements [Array] child elements of the record
    # @return [Hash] Solr field names mapped to their values
    def build_hash(elements)
      location_code = get_location_code(elements)
      links = get_links(elements)
      record_id = id(elements)
      title = select_element(elements, 'title')
      other_title = select_element(elements, 'othertitle')
      pub_year = choose_date(elements)
      genre_label = genre(elements)

      solr_doc = {
        'id' => record_id,
        'title_t' => title,
        'title_citation_display' => title,
        'title_display' => title,
        'title_sort' => title_sort(elements),
        'other_title_display' => other_title,
        'other_title_index' => other_title,
        'author_sort' => select_element(elements, 'creator'),
        'pub_date_start_sort' => pub_year,
        'pub_date_end_sort' => pub_year,
        'pub_created_display' => publication(elements),
        'form_genre_display' => genre_label,
        'genre_facet' => genre_label,
        'location_code_s' => location_code,
        'electronic_access_1display' => links,
        'access_facet' => access_facet(location_code, links)
      }
      # Later merges win over earlier keys, so the order below is significant.
      solr_doc.merge!(location_info(location_code, elements))
      solr_doc.merge!(map_non_special_to_solr(elements))
      solr_doc.merge!(subjects_fields(elements))
      solr_doc.merge!(HARD_CODED_TO_ADD)
      related_names(solr_doc)
      solr_doc
    end
160

161
    # Produces the location-related Solr fields for a record.
    #
    # @param location_code [String, nil] code read from the record's holdings
    # @param elements [Array] child elements of the record
    # @return [Hash] location fields, or an empty hash (after logging) when the
    #   code is not among the known locations
    def location_info(location_code, elements)
      unless locations[location_code]
        @logger.info("#{id(elements)}: Invalid location code #{location_code}")
        return {}
      end

      library = get_library(location_code)
      {
        'advanced_location_s' => [location_code, library],
        'location' => library,
        'holdings_1display' => holdings(elements, location_code)
      }
    end
173

174
    # When a document lists more than four authors, keeps only the first in
    # 'author_display' and moves the rest into 'related_name_json_1display' as JSON.
    #
    # @param doc [Hash] Solr document, modified in place
    # @return [String, nil] the JSON written to the document, or nil when nothing changed
    def related_names(doc)
      authors = doc['author_display']
      return if Array(authors).length <= 4

      primary = authors.shift
      doc['author_display'] = [primary]
      doc['related_name_json_1display'] = { 'Related name' => authors }.to_json
    end
181

182
    # Reads the record's year1 element and normalises it with process_date.
    #
    # @param elements [Array] child elements of the record
    # @return [String, nil] whatever process_date returns for the year1 text
    def choose_date(elements)
      raw_year = select_element(elements, 'year1')
      process_date(raw_year)
    end
185

186
    # Normalises a record's raw year value to a four-character year string.
    #
    # Two known bad values are corrected outright ('8981' => '0898', '173' => '1730').
    # A two-character value is read as a century number and becomes the first year
    # of that century, zero-padded to four digits ('19' => '1800', '10' => '0900').
    # Any other three-character value gets a leading zero. Everything else is
    # returned unchanged.
    #
    # @param year [String, nil] raw year text
    # @return [String, nil] the normalised year; nil for nil, 'uuuu', or a
    #   two-character value that is not a positive number
    def process_date(year)
      return nil if year.nil? || year == 'uuuu'
      return '0898' if year == '8981'
      return '1730' if year == '173'

      case year.length
      when 2
        # Base 10 is explicit so that a leading zero is not read as octal.
        century = Integer(year, 10, exception: false)
        century&.positive? ? format('%04d', (century - 1) * 100) : nil
      when 3
        "0#{year}"
      else
        year
      end
    end
204

205
    # Reads the location code from the first <collection> inside the record's
    # first <holdings> element.
    #
    # @param elements [Array] child elements of the record
    # @return [String, nil] the collection text, or nil when either element is missing
    def get_location_code(elements)
      holding = elements.find { |e| e.name == 'holdings' }
      return nil if holding.nil?

      collection = holding.elements.find { |child| child.name == 'collection' }
      collection&.text
    end
212

213
    # Returns the text of the record's first <genreform>, capitalised.
    #
    # An element with no text is treated like a missing element instead of
    # raising on nil.
    #
    # @param elements [Array] child elements of the record
    # @return [String, nil] capitalised genre, or nil when there is none
    def genre(elements)
      elements.find { |e| e.name == 'genreform' }&.text&.capitalize
    end
217

218
    # Builds the sort key for the record's first <title>: lower-cased, with
    # punctuation, one leading article (a, an, the) and all whitespace removed.
    #
    # The article is matched with \A so that only the very start of the title is
    # affected; with ^ an article at the start of any later line of a multi-line
    # title was removed as well.
    #
    # @param elements [Array] child elements of the record
    # @return [String, nil] the sort key, or nil when there is no title text
    def title_sort(elements)
      title = elements.find { |e| e.name == 'title' }&.text
      return nil if title.nil?

      title.downcase
           .gsub(/[^\p{Alnum}\s]/, '')
           .sub(/\A(a|an|the)\s/, '')
           .gsub(/\s/, '')
    end
223

224
    # Collects the record's <link> and <colllink> URLs that currently resolve
    # and serialises them for the electronic access field.
    #
    # Every URL is requested once. A 200 or 301 response counts as working; any
    # other status, a link without text, or a connection failure is logged as a
    # bad link and skipped, so one dead host does not abort the whole run.
    #
    # @param elements [Array] child elements of the record
    # @return [String, nil] JSON object mapping each working URL to a one-item
    #   label array (the capitalised last path segment), or nil when no link works
    def get_links(elements)
      candidates = elements.select { |e| %w[link colllink].include?(e.name) }
      working_links = candidates.select { |link| working_link?(link, elements) }
      return nil if working_links.empty?

      link_hash = working_links.to_h { |l| [l.text, [l.text.split('/').last.capitalize]] }
      link_hash.to_json
    end

    # Requests the link's URL and reports whether it answered 200 or 301,
    # logging redirects and failures against the record id.
    def working_link?(link, elements)
      url = link.text
      if url.nil? || url.strip.empty?
        @logger.info("#{id(elements)}: Bad link #{url}")
        return false
      end

      case Faraday.get(requestable_uri(url)).status
      when 200
        true
      when 301
        @logger.info("#{id(elements)}: Link redirect #{url}")
        true
      else
        @logger.info("#{id(elements)}: Bad link #{url}")
        false
      end
    rescue Faraday::Error => e
      @logger.info("#{id(elements)}: Bad link #{url} (#{e.class})")
      false
    end

    # Returns the link text as a URI, or, when it is not a valid URI as written
    # (for example because it contains spaces), as a string whose path segments
    # are percent-escaped.
    def requestable_uri(url)
      URI.parse(url)
    rescue URI::Error
      match = %r{(https?://)(.+?)(/.*)$}.match(url)
      return escape_segments(url) unless match

      "#{match[1]}#{match[2]}#{escape_segments(match[3])}"
    end

    # Percent-escapes each "/"-separated segment, encoding spaces as %20.
    def escape_segments(path)
      path.split('/').map { |segment| CGI.escape(segment).gsub('+', '%20') }.join('/')
    end
265

266
    # Serialises the record's single holding for the holdings display field.
    #
    # @param elements [Array] child elements of the record
    # @param location_code [String, nil] holding location code; nil marks an online-only item
    # @return [String] JSON object with one 'visuals' entry describing the holding
    def holdings(elements, location_code)
      call_number = select_element(elements, 'callno')
      location_note = select_element(elements, 'physicallocation')

      info = { 'location_code' => location_code || 'elfvisuals' }
      info.merge!('call_number' => call_number, 'call_number_browse' => call_number) unless call_number.nil?
      info['location_note'] = [location_note] unless location_note.nil?
      library, location =
        if location_code.nil?
          %w[Online Online]
        else
          [get_library(location_code), location_full_display(location_code)]
        end
      info.merge!('library' => library, 'location' => location, 'dspace' => true)

      { 'visuals' => info }.to_json
    end
286

287
    # Joins the record's first <imprint> and first <unitdate> into one
    # publication statement with every word capitalised.
    #
    # A punctuation character at the end of a line of the imprint is dropped
    # before joining. An element without text is treated like a missing element
    # instead of raising on nil.
    #
    # @param elements [Array] child elements of the record
    # @return [String, nil] "Imprint, Date", whichever part exists, or nil when both are empty
    def publication(elements)
      imprint = elements.find { |e| e.name == 'imprint' }&.text.to_s.gsub(/[[:punct:]]$/, '')
      date = elements.find { |e| e.name == 'unitdate' }&.text.to_s
      pubdate = [imprint, date].reject(&:empty?).join(', ')
      pubdate.empty? ? nil : pubdate.split.map(&:capitalize).join(' ')
    end
302

303
    # Builds the Solr id of a record from its <id> element.
    #
    # @param elements [Array] child elements of the record; must include an <id>
    # @return [String] the id text prefixed with "visuals"
    def id(elements)
      id_element = elements.find { |e| e.name == 'id' }
      "visuals#{id_element.text}"
    end
307

308
    # Returns the text of the first element with the given name.
    #
    # @param elements [Array] child elements of the record
    # @param field [String] element name to look for
    # @return [String, nil] the element's text, or nil when no element matches
    def select_element(elements, field)
      elements.find { |e| e.name == field }&.text
    end
312

313
    # Holding locations keyed by code, requested on first use and then reused.
    #
    # @return [Hash] the memoised result of request_locations
    def locations
      return @locations if @locations

      @locations = request_locations
    end
316

317
    # URL of the JSON list of holding locations.
    #
    # @return [String]
    def holding_locations_uri = 'https://bibdata.princeton.edu/locations/holding_locations.json'
320

321
    # Fetches the holding locations and indexes them by their 'code'.
    #
    # Entries without a code are skipped. A response other than 200 raises a
    # descriptive error; returning nil here only deferred the failure to the
    # first caller that indexed into the result.
    #
    # @return [Hash{String => Hash}] location entries keyed by code
    # @raise [RuntimeError] when the locations service does not answer with 200
    def request_locations
      response = Faraday.get(holding_locations_uri)
      unless response.status == 200
        raise "Unable to retrieve holding locations from #{holding_locations_uri} (HTTP #{response.status})"
      end

      @locations = JSON.parse(response.body).each_with_object({}) do |location, by_code|
        location_code = location['code']
        by_code[location_code] = location unless location_code.nil?
      end
    end
334

335
    # Looks up one holding location by its code.
    #
    # @param code [String, nil] location code
    # @return [Hash, nil] the location entry, or nil when the code is unknown
    def find_location(code:) = locations[code]
338

339
    # Returns the label of the library that a holding location belongs to.
    #
    # @param code [String, nil] location code
    # @return [Object, nil] the library's 'label', or nil when the location or
    #   its 'library' entry is missing
    def get_library(code)
      location = find_location(code:)
      library = location['library'] unless location.nil?
      library['label'] unless library.nil?
    end
348

349
    # Builds the display string for a holding location: the library label,
    # followed by " - " and the location label when the location has one.
    #
    # A missing (nil) label is treated like an empty one, so the result never
    # ends in a dangling " - ".
    #
    # @param code [String, nil] location code
    # @return [String, nil] the display string, or nil when the code is unknown
    def location_full_display(code)
      location = find_location(code:)
      return if location.nil?

      library = get_library(code)
      label = location['label'].to_s
      label.empty? ? library : "#{library} - #{label}"
    end
355

356
    # Lists how a record can be accessed.
    #
    # @param location_code [String, nil] non-nil adds 'In the Library'
    # @param links [String, nil] non-nil adds 'Online'
    # @return [Array<String>] the access labels, possibly empty
    def access_facet(location_code, links)
      [
        ('In the Library' unless location_code.nil?),
        ('Online' unless links.nil?)
      ].compact
    end
362

363
    # Builds the subject fields from the record's <subject> elements.
    #
    # Full subject strings have '--' replaced by SEPARATOR; the topic facet
    # holds the unique parts obtained by splitting on '--'. Subject elements
    # without text are skipped instead of raising on nil.
    #
    # @param elements [Array] child elements of the record
    # @return [Hash] the three subject fields, or an empty hash when the record
    #   has no subject text
    def subjects_fields(elements)
      subjects = elements.select { |e| e.name == 'subject' }.filter_map(&:text)
      return {} if subjects.empty?

      full_subjects = subjects.map { |subject| subject.gsub('--', SEPARATOR) }
      {
        'subject_facet' => full_subjects,
        'subject_display' => full_subjects,
        'subject_topic_facet' => subjects.flat_map { |subject| subject.split('--') }.uniq
      }
    end
379

380
    # this is kind of a mess...
381
    # Copies the text of every element listed in NON_SPECIAL_ELEMENT_MAPPING
    # into the Solr fields mapped to it. Elements that share a field accumulate
    # in mapping order, and single-value arrays are collapsed at the end.
    #
    # @param vis_elements [Array] child elements of the record
    # @return [Hash] Solr field names mapped to the collected texts
    def map_non_special_to_solr(vis_elements)
      mapped = NON_SPECIAL_ELEMENT_MAPPING.each_with_object({}) do |(element_name, fields), solr_doc|
        texts = vis_elements.select { |e| e.name == element_name }.map(&:text)
        fields.each { |field| (solr_doc[field] ||= []).concat(texts) }
      end
      collapse_single_val_arrays(mapped)
    end
395

396
    # rubocop:disable Naming/MethodParameterName
397
    # Replaces every one-element array value with its single element, in place.
    #
    # @param h [Hash] hash to modify
    # @return [Hash] the same hash
    def collapse_single_val_arrays(h)
      h.transform_values! do |value|
        value.is_a?(Array) && value.length == 1 ? value.first : value
      end
    end
403
    # rubocop:enable Naming/MethodParameterName
404
  end
405
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc