• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / orangetheses / 38778a1e-2dd3-4ec3-b4d6-a6c4d301682a

07 Dec 2023 05:02PM UTC coverage: 87.536% (+0.6%) from 86.964%
38778a1e-2dd3-4ec3-b4d6-a6c4d301682a

push

circleci

web-flow
Merge pull request #79 from pulibrary/i77-jrgriffiniii-date-display

Updates Rubocop, refactors DataSpace Solr Document generation, and ensures that invalid embargo dates trigger log warnings

190 of 196 new or added lines in 5 files covered. (96.94%)

12 existing lines in 1 file now uncovered.

604 of 690 relevant lines covered (87.54%)

18.31 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.56
/lib/orangetheses/fetcher.rb
1
# frozen_string_literal: true
2

3
require 'faraday'
1✔
4
require 'json'
1✔
5
require 'tmpdir'
1✔
6
require 'openssl'
1✔
7
require 'retriable'
1✔
8
require 'logger'
1✔
9

10
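# SECURITY NOTE: the reassignment below redefines OpenSSL's VERIFY_PEER constant as VERIFY_NONE, which disables TLS certificate verification for every OpenSSL client in this process (not only DSpace requests). It exists so that a failing SSL negotiation with DSpace does not abort the run.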
11
OpenSSL::SSL::VERIFY_PEER = OpenSSL::SSL::VERIFY_NONE
1✔
12

13
module Orangetheses
1✔
14
  class Fetcher
1✔
15
    attr_writer :logger
1✔
16

17
    ##
    # Location of the DSpace configuration file, two directories above the
    # directory that holds this source file.
    # @return [String] path ending in config/dspace.yml
    def self.config_file_path
      source_dir = File.dirname(__FILE__)
      File.join(source_dir, '..', '..', 'config', 'dspace.yml')
    end
20

21
    ##
    # Raw (unrendered) contents of the configuration file.
    #
    # File.read is used rather than IO.read: IO.read treats a path that begins
    # with "|" as a command to spawn, whereas File.read only ever opens files.
    # @return [String] the text of the file at config_file_path
    # @raise [SystemCallError] when the file cannot be read (e.g. Errno::ENOENT)
    def self.config_file
      File.read(config_file_path)
    end
24

25
    ##
    # Render the configuration file through ERB.
    #
    # The file is read before the rescue-protected section so that a read
    # failure propagates unchanged, and the error message names the file path
    # instead of interpolating the entire file contents.
    # @return [String] the configuration text after ERB evaluation
    # @raise [RuntimeError] when the template cannot be evaluated by ERB
    def self.config_erb
      template = config_file
      begin
        ERB.new(template).result(binding)
      rescue StandardError, SyntaxError => e
        raise("#{config_file_path} was found, but could not be parsed with ERB. \n#{e.inspect}")
      end
    end
30

31
    ##
    # Parse the ERB-rendered configuration as YAML, with aliases permitted.
    # @return [Object] whatever YAML.safe_load yields for the rendered text
    def self.config_yaml
      rendered = config_erb
      YAML.safe_load(rendered, aliases: true)
    end
34

35
    ##
    # Name of the configuration environment, taken from ORANGETHESES_ENV.
    # @return [String] the variable's value, or 'development' when it is unset
    def self.env
      ENV.fetch('ORANGETHESES_ENV', 'development')
    end
38

39
    ##
    # The section of the parsed configuration keyed by the current environment.
    # @return [Object, nil] the entry stored under env, or nil when absent
    def self.env_config
      settings = config_yaml
      settings[env]
    end
42

43
    ##
    # Configured value of the 'server' key for the current environment.
    def self.default_server = env_config['server']
46

47
    ##
    # Configured value of the 'community' key for the current environment.
    def self.default_community = env_config['community']
50

51
    ##
    # Configured value of the 'rest_limit' key for the current environment.
    def self.default_rest_limit = env_config['rest_limit']
54

55
    # Each keyword falls back to the class-level configured default when it is
    # nil (or false); the default is only looked up in that case.
    # @param server [String, nil] base of the REST URLs built by this object
    #   (e.g. 'https://dataspace.princeton.edu/rest/')
    # @param community [String, nil] handle of the community to fetch
    #   (e.g. '88435/dsp019c67wm88m')
    # @param rest_limit [Integer, nil] value sent as `limit` in collection
    #   requests and added to the offset after each non-empty page
    def initialize(server: nil, community: nil, rest_limit: nil)
      defaults = self.class

      @server = server || defaults.default_server
      @community = community || defaults.default_community
      @rest_limit = rest_limit || defaults.default_rest_limit
    end
64

65
    ##
    # The logger for this object. Unless one has already been assigned, a
    # Logger writing to $stdout at DEBUG level is created and remembered.
    # @return [Logger]
    def logger
      @logger ||= Logger.new($stdout).tap { |log| log.level = Logger::DEBUG }
    end
72

73
    ##
    # Where files get cached for later indexing: the FILEPATH environment
    # variable when set, otherwise '/tmp/theses.json'. The first answer is
    # memoized.
    # @return [String]
    def json_file_path
      @json_file_path ||= ENV.fetch('FILEPATH', '/tmp/theses.json')
    end
78

79
    ##
    # Callback that writes a debug log entry each time an API call fails and
    # is about to be retried.
    # See https://github.com/kamui/retriable#callbacks for more information.
    # @return [Proc] receives the exception, the try count, the elapsed time
    #   and the interval before the next try
    def log_retries
      proc do |error, attempt, elapsed, wait|
        failure = "#{error.class}: '#{error.message}'"
        logger.debug "#{failure} - #{attempt} tries in #{elapsed} seconds and #{wait} seconds until the next try."
      end
    end
87

88
    ##
    # Page through a collection's items until an empty page comes back.
    # Each request is retried on JSON::ParserError, up to
    # Orangetheses::RETRY_LIMIT tries, with retries reported via log_retries.
    # @param id [String] thesis collection id
    # @return [Array<Hash>] metadata hash for each record
    def fetch_collection(id)
      pages = []
      position = 0
      finished = false

      until finished
        page_url = build_collection_url(id:, offset: position)
        logger.debug("Querying for the DSpace Collection at #{page_url}...")
        Retriable.retriable(on: JSON::ParserError, tries: Orangetheses::RETRY_LIMIT, on_retry: log_retries) do
          batch = JSON.parse(api_client.get(page_url).body)
          if batch.empty?
            # An empty page marks the end of the collection.
            finished = true
          else
            pages << flatten_json(batch)
            position += @rest_limit
          end
        end
      end

      pages.flatten
    end
112

113
    ##
    # Fetch one collection and hand every record to the indexer.
    # @param indexer [#index_hash] receives each fetched record
    # @param id [String] collection id passed to fetch_collection
    # @return [Array<Hash>] the fetched records
    def index_collection(indexer, id)
      fetch_collection(id).each { |thesis| indexer.index_hash(thesis) }
    end
119

120
    ##
    # Index every collection returned by collections, one after another.
    # @param indexer [Object] passed through to index_collection
    # @return [Array] the collection ids that were iterated
    def index_all_collections(indexer)
      collections.each { |collection_id| index_collection(indexer, collection_id) }
    end
125

126
    ##
    # Cache all collections: build the documents for every collection and
    # return them as one flat list.
    # @param indexer [Object] passed through to cache_collection
    # @return [Array] flattened concatenation of each collection's documents
    def cache_all_collections(indexer)
      gathered = collections.inject([]) do |documents, collection_id|
        documents + cache_collection(indexer, collection_id)
      end

      gathered.flatten
    end
138

139
    ##
    # Cache a single collection: fetch its records and build one Solr document
    # per record.
    # @param indexer [#build_solr_document] called with each record's entries
    #   splatted as keyword arguments
    # @param collection_id [String] collection id passed to fetch_collection
    # @return [Array] the built documents, in fetch order
    def cache_collection(indexer, collection_id)
      fetch_collection(collection_id).map do |attrs|
        indexer.build_solr_document(**attrs)
      end
    end
152

153
    ##
    # Get a json representation of a single collection and write it as JSON to
    # a cache file.
    #
    # The documents are fetched and serialized BEFORE the file is opened: mode
    # 'w' truncates on open, so opening first would wipe the previous cache
    # whenever the fetch raises.
    # @param collection_id [String] collection id passed to cache_collection
    # @return [nil]
    def self.write_collection_to_cache(collection_id)
      indexer = Indexer.new
      fetcher = Fetcher.new
      documents = fetcher.cache_collection(indexer, collection_id)
      json_cache = JSON.pretty_generate(documents.map(&:to_solr))
      File.open(fetcher.json_file_path, 'w') { |f| f.puts(json_cache) }
    end
166

167
    ##
    # Get a json representation of all thesis collections and write it as JSON to
    # a cache file.
    #
    # The documents are fetched and serialized BEFORE the file is opened: mode
    # 'w' truncates on open, so opening first would wipe the previous cache
    # whenever the fetch raises.
    # @return [nil]
    def self.write_all_collections_to_cache
      indexer = Indexer.new
      fetcher = Fetcher.new
      documents = fetcher.cache_all_collections(indexer)
      json_cache = JSON.pretty_generate(documents.map(&:to_solr))
      File.open(fetcher.json_file_path, 'w') { |f| f.puts(json_cache) }
    end
180

181
    ##
    # The DSpace id of the community we're fetching content for, as a String.
    # E.g., for handle '88435/dsp019c67wm88m', the DSpace id is 267
    # The value is memoized after the first lookup.
    # @return [String]
    def api_community_id
      return @api_community_id if @api_community_id

      @api_community_id = api_community['id'].to_s
    end
187

188
    private
1✔
189

190
    ##
    # URL for one page of a collection's items, with metadata expanded.
    # @param id [#to_s] collection id placed in the path
    # @param offset [#to_s] value of the `offset` query parameter
    # @return [String]
    def build_collection_url(id:, offset:)
      query = "limit=#{@rest_limit}&offset=#{offset}&expand=metadata"
      "#{@server}/collections/#{id}/items?#{query}"
    end
193

194
    ##
    # Collapse each item's metadata list into a single hash.
    #
    # The result for an item has 'id' set to the part of its 'handle' after the
    # last "/", plus one entry per metadata key holding every non-nil value
    # seen for that key. 'pu.department' and 'pu.certificate' values are first
    # replaced (in the input entry itself) by map_department / map_program;
    # entries whose value ends up nil are skipped.
    # @param items [Array<Hash>] items with 'handle' and 'metadata' entries
    # @return [Array<Hash>] one flattened hash per item
    def flatten_json(items)
      items.map do |item|
        record = { 'id' => item['handle'][%r{[^/]*$}] }
        item['metadata'].each do |field|
          case field['key']
          when 'pu.department' then field['value'] = map_department(field['value'])
          when 'pu.certificate' then field['value'] = map_program(field['value'])
          end
          value = field['value']
          next if value.nil?

          (record[field['key']] ||= []) << value
        end
        record
      end
    end
212

213
    ##
    # The object used to issue HTTP GET requests (the Faraday module itself).
    def api_client = Faraday
216

217
    ##
    # Body of the response from the server's communities/ endpoint, fetched
    # once and memoized.
    # @return [String] the raw response body
    def api_communities
      @api_communities ||= api_client.get("#{@server}/communities/").body
    end
223

224
    ##
    # Parse the JSON feed containing all of the communities, and return only the
    # community whose 'handle' equals the configured community handle. A match
    # is memoized; when nothing matches, the feed is searched again on the next
    # call.
    # @return [Hash, nil] the parsed community entry, or nil when none matches
    def api_community
      return @api_community if @api_community

      listing = JSON.parse(api_communities)
      @api_community = listing.find { |community| community['handle'] == @community }
    end
231

232
    ##
    # Get all of the collections for a given community: the raw response body
    # of the community's collections endpoint, fetched once and memoized.
    # @return [String] the raw response body
    def api_collections
      @api_collections ||= api_client.get("#{@server}/communities/#{api_community_id}/collections").body
    end
240

241
    ##
    # All of the collections for a given community, parsed from JSON. A truthy
    # parse result is memoized.
    # @return [Object] the parsed form of api_collections
    def api_collections_json
      return @api_collections_json if @api_collections_json

      @api_collections_json = JSON.parse(api_collections)
    end
246

247
    ##
    # The 'id' of every collection in the parsed collections feed, memoized.
    # @return [Array] one id per collection entry
    def collections
      @collections ||= api_collections_json.map { |collection| collection['id'] }
    end
250

251
    ##
    # Look a department name up in lc_authorized_departments.
    # @param dept [String] key to look up
    # @return [String, nil] the mapped heading, or nil when the key is absent
    def map_department(dept) = lc_authorized_departments[dept]
254

255
    ##
    # Look a certificate program name up in lc_authorized_programs.
    # @param program [String] key to look up
    # @return [String, nil] the mapped heading, or nil when the key is absent
    def map_program(program) = lc_authorized_programs[program]
258

259
    ##
    # Lookup table from the department names found in 'pu.department' metadata
    # to the heading that replaces them.
    #
    # The table is built once per object and frozen; previously a fresh
    # 49-entry Hash was allocated on every single lookup.
    # @return [Hash{String => String}] frozen mapping
    def lc_authorized_departments
      @lc_authorized_departments ||= {
        'African American Studies' => 'Princeton University. Department of African American Studies',
        'Art and Archaeology' => 'Princeton University. Department of Art and Archaeology',
        'Aeronautical Engineering' => 'Princeton University. Department of Aeronautical Engineering',
        'Anthropology' => 'Princeton University. Department of Anthropology',
        'Architecture School' => 'Princeton University. School of Architecture',
        'Astrophysical Sciences' => 'Princeton University. Department of Astrophysical Sciences',
        'Biochemical Sciences' => 'Princeton University. Department of Biochemical Sciences',
        'Biology' => 'Princeton University. Department of Biology',
        'Civil and Environmental Engineering' => 'Princeton University. Department of Civil and Environmental Engineering',
        'Civil Engineering and Operations Research' => 'Princeton University. Department of Civil Engineering and Operations Research',
        'Chemical and Biological Engineering' => 'Princeton University. Department of Chemical and Biological Engineering',
        'Chemistry' => 'Princeton University. Department of Chemistry',
        'Classics' => 'Princeton University. Department of Classics',
        'Comparative Literature' => 'Princeton University. Department of Comparative Literature',
        'Computer Science' => 'Princeton University. Department of Computer Science',
        'East Asian Studies' => 'Princeton University. Department of East Asian Studies',
        'Economics' => 'Princeton University. Department of Economics',
        'Ecology and Evolutionary Biology' => 'Princeton University. Department of Ecology and Evolutionary Biology',
        'Electrical Engineering' => 'Princeton University. Department of Electrical Engineering',
        'Engineering and Applied Science' => 'Princeton University. School of Engineering and Applied Science',
        'English' => 'Princeton University. Department of English',
        'French and Italian' => 'Princeton University. Department of French and Italian',
        'Geosciences' => 'Princeton University. Department of Geosciences',
        'German' => 'Princeton University. Department of Germanic Languages and Literatures',
        'History' => 'Princeton University. Department of History',
        'Special Program in Humanities' => 'Princeton University. Special Program in the Humanities',
        'Independent Concentration' => 'Princeton University Independent Concentration Program',
        'Mathematics' => 'Princeton University. Department of Mathematics',
        'Molecular Biology' => 'Princeton University. Department of Molecular Biology',
        'Mechanical and Aerospace Engineering' => 'Princeton University. Department of Mechanical and Aerospace Engineering',
        'Medieval Studies' => 'Princeton University. Program in Medieval Studies',
        'Modern Languages' => 'Princeton University. Department of Modern Languages.',
        'Music' => 'Princeton University. Department of Music',
        'Near Eastern Studies' => 'Princeton University. Department of Near Eastern Studies',
        'Neuroscience' => 'Princeton Neuroscience Institute',
        'Operations Research and Financial Engineering' => 'Princeton University. Department of Operations Research and Financial Engineering',
        'Oriental Studies' => 'Princeton University. Department of Oriental Studies',
        'Philosophy' => 'Princeton University. Department of Philosophy',
        'Physics' => 'Princeton University. Department of Physics',
        'Politics' => 'Princeton University. Department of Politics',
        'Psychology' => 'Princeton University. Department of Psychology',
        'Religion' => 'Princeton University. Department of Religion',
        'Romance Languages and Literatures' => 'Princeton University. Department of Romance Languages and Literatures',
        'Slavic Languages and Literature' => 'Princeton University. Department of Slavic Languages and Literatures',
        'Sociology' => 'Princeton University. Department of Sociology',
        'Spanish and Portuguese' => 'Princeton University. Department of Spanish and Portuguese Languages and Cultures',
        'Spanish and Portuguese Languages and Cultures' => 'Princeton University. Department of Spanish and Portuguese Languages and Cultures',
        'Statistics' => 'Princeton University. Department of Statistics',
        'School of Public and International Affairs' => 'School of Public and International Affairs'
      }.freeze
    end
312

313
    ##
    # Lookup table from the certificate program names found in 'pu.certificate'
    # metadata to the heading that replaces them.
    #
    # The table is built once per object and frozen; previously a fresh
    # 40-entry Hash was allocated on every single lookup.
    # @return [Hash{String => String}] frozen mapping
    def lc_authorized_programs
      @lc_authorized_programs ||= {
        'African American Studies Program' => 'Princeton University. Program in African-American Studies',
        'African Studies Program' => 'Princeton University. Program in African Studies',
        'American Studies Program' => 'Princeton University. Program in American Studies',
        'Applications of Computing Program' => 'Princeton University. Program in Applications of Computing',
        'Architecture and Engineering Program' => 'Princeton University. Program in Architecture and Engineering',
        'Center for Statistics and Machine Learning' => 'Princeton University. Center for Statistics and Machine Learning',
        'Creative Writing Program' => 'Princeton University. Creative Writing Program',
        'East Asian Studies Program' => 'Princeton University. Program in East Asian Studies',
        'Engineering Biology Program' => 'Princeton University. Program in Engineering Biology',
        'Engineering and Management Systems Program' => 'Princeton University. Program in Engineering and Management Systems',
        'Environmental Studies Program' => 'Princeton University. Program in Environmental Studies',
        'Ethnographic Studies Program' => 'Princeton University. Program in Ethnographic Studies',
        'European Cultural Studies Program' => 'Princeton University. Program in European Cultural Studies',
        'Finance Program' => 'Princeton University. Program in Finance',
        'Geological Engineering Program' => 'Princeton University. Program in Geological Engineering',
        'Global Health and Health Policy Program' => 'Princeton University. Program in Global Health and Health Policy',
        'Hellenic Studies Program' => 'Princeton University. Program in Hellenic Studies',
        'Humanities Council and Humanistic Studies Program' => 'Princeton University. Program in Humanistic Studies',
        'Judaic Studies Program' => 'Princeton University. Program in Judaic Studies',
        'Latin American Studies Program' => 'Princeton University. Program in Latin American Studies',
        'Latino Studies Program' => 'Princeton University. Program in Latino Studies',
        'Linguistics Program' => 'Princeton University. Program in Linguistics',
        'Materials Science and Engineering Program' => 'Princeton University. Program in Materials Science and Engineering',
        'Medieval Studies Program' => 'Princeton University. Program in Medieval Studies',
        'Near Eastern Studies Program' => 'Princeton University. Program in Near Eastern Studies',
        'Neuroscience Program' => 'Princeton University. Program in Neuroscience',
        'Program in Cognitive Science' => 'Princeton University. Program in Cognitive Science',
        'Program in Entrepreneurship' => 'Princeton University. Program in Entrepreneurship',
        'Program in Gender and Sexuality Studies' => 'Princeton University. Program in Gender and Sexuality Studies',
        'Program in Music Theater' => 'Princeton University. Program in Music Theater',
        'Program in Technology & Society, Technology Track' => 'Princeton University. Program in Technology and Society',
        'Program in Values and Public Life' => 'Princeton University. Program in Values and Public Life',
        'Quantitative and Computational Biology Program' => 'Princeton University. Program in Quantitative and Computational Biology',
        'Robotics & Intelligent Systems Program' => 'Princeton University. Program in Robotics and Intelligent Systems',
        'Russian & Eurasian Studies Program' => 'Princeton University. Program in Russian, East European and Eurasian Studies',
        'South Asian Studies Program' => 'Princeton University. Program in South Asian Studies',
        'Theater' => 'Princeton University. Program in Theater',
        'Theater Program' => 'Princeton University. Program in Theater',
        'Sustainable Energy Program' => 'Princeton University. Program in Sustainable Energy',
        'Urban Studies Program' => 'Princeton University. Program in Urban Studies'
      }.freeze
    end
357
  end
358
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc