• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / orangetheses / d7643155-150f-4689-a91e-119e5fbfab15

07 Oct 2024 08:29PM UTC coverage: 22.253% (-65.1%) from 87.344%
d7643155-150f-4689-a91e-119e5fbfab15

push

circleci

christinach
Remove pry-byebug

160 of 719 relevant lines covered (22.25%)

0.22 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

4.96
/lib/orangetheses/fetcher.rb
1
# frozen_string_literal: true
2

3
require 'faraday'
1✔
4
require 'json'
1✔
5
require 'tmpdir'
1✔
6
require 'openssl'
1✔
7
require 'retriable'
1✔
8
require 'logger'
1✔
9
require 'pry'
1✔
10

11
# Do not fail if SSL negotiation with DSpace isn't working
# NOTE(review): this reassigns a constant inside OpenSSL::SSL, so any code in
# this process that references OpenSSL::SSL::VERIFY_PEER after this file is
# loaded gets VERIFY_NONE instead -- the effect is not limited to DSpace
# requests. Ruby also emits an "already initialized constant" warning here.
# Consider configuring verification on the HTTP client instead -- TODO confirm
# with the maintainers before changing.
12
OpenSSL::SSL::VERIFY_PEER = OpenSSL::SSL::VERIFY_NONE
×
13

14
module Orangetheses
  ##
  # Fetches thesis records from a DSpace REST API, flattens each record's
  # metadata into a hash keyed by metadata field name, and can hand those
  # hashes to an indexer or cache the resulting Solr documents as JSON.
  class Fetcher
    # Department names as they appear in the `pu.department` metadata field,
    # mapped to the heading that replaces them in the flattened record.
    # (Presumably Library of Congress authorized forms, per the `lc_` method
    # names -- confirm with the maintainers.)
    LC_AUTHORIZED_DEPARTMENTS = {
      'African American Studies' => 'Princeton University. Department of African American Studies',
      'Art and Archaeology' => 'Princeton University. Department of Art and Archaeology',
      'Aeronautical Engineering' => 'Princeton University. Department of Aeronautical Engineering',
      'Anthropology' => 'Princeton University. Department of Anthropology',
      'Architecture School' => 'Princeton University. School of Architecture',
      'Astrophysical Sciences' => 'Princeton University. Department of Astrophysical Sciences',
      'Biochemical Sciences' => 'Princeton University. Department of Biochemical Sciences',
      'Biology' => 'Princeton University. Department of Biology',
      'Civil and Environmental Engineering' => 'Princeton University. Department of Civil and Environmental Engineering',
      'Civil Engineering and Operations Research' => 'Princeton University. Department of Civil Engineering and Operations Research',
      'Chemical and Biological Engineering' => 'Princeton University. Department of Chemical and Biological Engineering',
      'Chemistry' => 'Princeton University. Department of Chemistry',
      'Classics' => 'Princeton University. Department of Classics',
      'Comparative Literature' => 'Princeton University. Department of Comparative Literature',
      'Computer Science' => 'Princeton University. Department of Computer Science',
      'East Asian Studies' => 'Princeton University. Department of East Asian Studies',
      'Economics' => 'Princeton University. Department of Economics',
      'Ecology and Evolutionary Biology' => 'Princeton University. Department of Ecology and Evolutionary Biology',
      'Electrical Engineering' => 'Princeton University. Department of Electrical Engineering',
      'Engineering and Applied Science' => 'Princeton University. School of Engineering and Applied Science',
      'English' => 'Princeton University. Department of English',
      'French and Italian' => 'Princeton University. Department of French and Italian',
      'Geosciences' => 'Princeton University. Department of Geosciences',
      'German' => 'Princeton University. Department of Germanic Languages and Literatures',
      'History' => 'Princeton University. Department of History',
      'Special Program in Humanities' => 'Princeton University. Special Program in the Humanities',
      'Independent Concentration' => 'Princeton University Independent Concentration Program',
      'Mathematics' => 'Princeton University. Department of Mathematics',
      'Molecular Biology' => 'Princeton University. Department of Molecular Biology',
      'Mechanical and Aerospace Engineering' => 'Princeton University. Department of Mechanical and Aerospace Engineering',
      'Medieval Studies' => 'Princeton University. Program in Medieval Studies',
      'Modern Languages' => 'Princeton University. Department of Modern Languages.',
      'Music' => 'Princeton University. Department of Music',
      'Near Eastern Studies' => 'Princeton University. Department of Near Eastern Studies',
      'Neuroscience' => 'Princeton Neuroscience Institute',
      'Operations Research and Financial Engineering' => 'Princeton University. Department of Operations Research and Financial Engineering',
      'Oriental Studies' => 'Princeton University. Department of Oriental Studies',
      'Philosophy' => 'Princeton University. Department of Philosophy',
      'Physics' => 'Princeton University. Department of Physics',
      'Politics' => 'Princeton University. Department of Politics',
      'Psychology' => 'Princeton University. Department of Psychology',
      'Religion' => 'Princeton University. Department of Religion',
      'Romance Languages and Literatures' => 'Princeton University. Department of Romance Languages and Literatures',
      'Slavic Languages and Literature' => 'Princeton University. Department of Slavic Languages and Literatures',
      'Sociology' => 'Princeton University. Department of Sociology',
      'Spanish and Portuguese' => 'Princeton University. Department of Spanish and Portuguese Languages and Cultures',
      'Spanish and Portuguese Languages and Cultures' => 'Princeton University. Department of Spanish and Portuguese Languages and Cultures',
      'Statistics' => 'Princeton University. Department of Statistics',
      'School of Public and International Affairs' => 'School of Public and International Affairs'
    }.freeze

    # Certificate program names as they appear in the `pu.certificate`
    # metadata field, mapped to the heading that replaces them.
    LC_AUTHORIZED_PROGRAMS = {
      'African American Studies Program' => 'Princeton University. Program in African-American Studies',
      'African Studies Program' => 'Princeton University. Program in African Studies',
      'American Studies Program' => 'Princeton University. Program in American Studies',
      'Applications of Computing Program' => 'Princeton University. Program in Applications of Computing',
      'Architecture and Engineering Program' => 'Princeton University. Program in Architecture and Engineering',
      'Center for Statistics and Machine Learning' => 'Princeton University. Center for Statistics and Machine Learning',
      'Creative Writing Program' => 'Princeton University. Creative Writing Program',
      'East Asian Studies Program' => 'Princeton University. Program in East Asian Studies',
      'Engineering Biology Program' => 'Princeton University. Program in Engineering Biology',
      'Engineering and Management Systems Program' => 'Princeton University. Program in Engineering and Management Systems',
      'Environmental Studies Program' => 'Princeton University. Program in Environmental Studies',
      'Ethnographic Studies Program' => 'Princeton University. Program in Ethnographic Studies',
      'European Cultural Studies Program' => 'Princeton University. Program in European Cultural Studies',
      'Finance Program' => 'Princeton University. Program in Finance',
      'Geological Engineering Program' => 'Princeton University. Program in Geological Engineering',
      'Global Health and Health Policy Program' => 'Princeton University. Program in Global Health and Health Policy',
      'Hellenic Studies Program' => 'Princeton University. Program in Hellenic Studies',
      'Humanities Council and Humanistic Studies Program' => 'Princeton University. Program in Humanistic Studies',
      'Judaic Studies Program' => 'Princeton University. Program in Judaic Studies',
      'Latin American Studies Program' => 'Princeton University. Program in Latin American Studies',
      'Latino Studies Program' => 'Princeton University. Program in Latino Studies',
      'Linguistics Program' => 'Princeton University. Program in Linguistics',
      'Materials Science and Engineering Program' => 'Princeton University. Program in Materials Science and Engineering',
      'Medieval Studies Program' => 'Princeton University. Program in Medieval Studies',
      'Near Eastern Studies Program' => 'Princeton University. Program in Near Eastern Studies',
      'Neuroscience Program' => 'Princeton University. Program in Neuroscience',
      'Program in Cognitive Science' => 'Princeton University. Program in Cognitive Science',
      'Program in Entrepreneurship' => 'Princeton University. Program in Entrepreneurship',
      'Program in Gender and Sexuality Studies' => 'Princeton University. Program in Gender and Sexuality Studies',
      'Program in Music Theater' => 'Princeton University. Program in Music Theater',
      'Program in Technology & Society, Technology Track' => 'Princeton University. Program in Technology and Society',
      'Program in Values and Public Life' => 'Princeton University. Program in Values and Public Life',
      'Quantitative and Computational Biology Program' => 'Princeton University. Program in Quantitative and Computational Biology',
      'Robotics & Intelligent Systems Program' => 'Princeton University. Program in Robotics and Intelligent Systems',
      'Russian & Eurasian Studies Program' => 'Princeton University. Program in Russian, East European and Eurasian Studies',
      'South Asian Studies Program' => 'Princeton University. Program in South Asian Studies',
      'Theater' => 'Princeton University. Program in Theater',
      'Theater Program' => 'Princeton University. Program in Theater',
      'Sustainable Energy Program' => 'Princeton University. Program in Sustainable Energy',
      'Urban Studies Program' => 'Princeton University. Program in Urban Studies'
    }.freeze

    private_constant :LC_AUTHORIZED_DEPARTMENTS, :LC_AUTHORIZED_PROGRAMS

    attr_writer :logger

    # @return [String] path to config/dspace.yml, two directories above this file
    def self.config_file_path
      File.join(File.dirname(__FILE__), '..', '..', 'config', 'dspace.yml')
    end

    # @return [String] raw contents of the configuration file
    def self.config_file
      File.read(config_file_path)
    end

    ##
    # Render the configuration file through ERB.
    # @return [String] the rendered configuration
    # @raise [RuntimeError] if the template cannot be evaluated
    def self.config_erb
      ERB.new(config_file).result(binding)
    rescue StandardError, SyntaxError => e
      # Report the path of the file, not its contents.
      raise("#{config_file_path} was found, but could not be parsed with ERB. \n#{e.inspect}")
    end

    # @return [Hash] the rendered configuration parsed as YAML (aliases allowed)
    def self.config_yaml
      YAML.safe_load(config_erb, aliases: true)
    end

    # @return [String] value of ORANGETHESES_ENV, or 'development' when unset
    def self.env
      ENV.fetch('ORANGETHESES_ENV', 'development')
    end

    # @return [Hash] the configuration section for the current environment
    def self.env_config
      config_yaml[env]
    end

    # @return [Object] the 'server' entry of the environment configuration
    def self.default_server
      env_config['server']
    end

    # @return [Object] the 'community' entry of the environment configuration
    def self.default_community
      env_config['community']
    end

    # @return [Object] the 'rest_limit' entry of the environment configuration
    def self.default_rest_limit
      env_config['rest_limit']
    end

    ##
    # Each argument falls back to the matching entry of the environment
    # configuration when it is nil.
    # @param server [String, nil] base URL of the DSpace REST API
    # @param community [String, nil] handle of the community to fetch
    # @param rest_limit [Integer, nil] page size used when paging through a collection
    def initialize(server: nil, community: nil, rest_limit: nil)
      @server = server || self.class.default_server
      @community = community || self.class.default_community
      @rest_limit = rest_limit || self.class.default_rest_limit
    end

    # @return [Logger] the injected logger, or a DEBUG-level logger on $stdout
    def logger
      @logger ||= Logger.new($stdout, level: Logger::DEBUG)
    end

    ##
    # Where files get cached for later indexing
    # @return [String] value of FILEPATH, or '/tmp/theses.json' when unset
    def json_file_path
      @json_file_path ||= ENV.fetch('FILEPATH', '/tmp/theses.json')
    end

    ##
    # Write to the log anytime an API call fails and we have to retry.
    # See https://github.com/kamui/retriable#callbacks for more information.
    # @return [Proc] callback accepting (exception, try, elapsed_time, next_interval)
    def log_retries
      proc do |exception, try, elapsed_time, next_interval|
        logger.debug "#{exception.class}: '#{exception.message}' - #{try} tries in #{elapsed_time} seconds " \
                     "and #{next_interval} seconds until the next try."
      end
    end

    ##
    # Page through a collection until the API returns an empty page.
    # A page whose body is not valid JSON is retried via Retriable.
    # @param id [String] thesis collection id
    # @return [Array<Hash>] metadata hash for each record
    def fetch_collection(id)
      theses = []
      offset = 0
      completed = false

      until completed
        url = build_collection_url(id:, offset:)
        logger.debug("Querying for the DSpace Collection at #{url}...")
        Retriable.retriable(on: JSON::ParserError, tries: Orangetheses::RETRY_LIMIT, on_retry: log_retries) do
          response = api_client.get(url)
          items = JSON.parse(response.body)
          if items.empty?
            completed = true
          else
            theses.concat(flatten_json(items))
            # Only advance once the page has been parsed, so a retry re-requests the same page.
            offset += @rest_limit
          end
        end
      end
      theses
    end

    ##
    # Fetch one collection and pass every record to the indexer.
    # @param indexer [#index_hash]
    # @param id [String] thesis collection id
    def index_collection(indexer, id)
      fetch_collection(id).each { |record| indexer.index_hash(record) }
    end

    ##
    # Index every collection of the configured community.
    # @param indexer [#index_hash]
    def index_all_collections(indexer)
      collections.each { |collection_id| index_collection(indexer, collection_id) }
    end

    ##
    # Cache all collections
    # @param indexer [#build_solr_document]
    # @return [Array] Solr documents for every collection of the community
    def cache_all_collections(indexer)
      collections.flat_map { |collection_id| cache_collection(indexer, collection_id) }
    end

    ##
    # Cache a single collection
    # @param indexer [#build_solr_document]
    # @param collection_id [String] thesis collection id
    # @return [Array] one Solr document per fetched record
    def cache_collection(indexer, collection_id)
      fetch_collection(collection_id).map { |attrs| indexer.build_solr_document(**attrs) }
    end

    ##
    # Get a json representation of a single collection and write it as JSON to
    # a cache file.
    # @param collection_id [String] thesis collection id
    def self.write_collection_to_cache(collection_id)
      indexer = Indexer.new
      fetcher = Fetcher.new
      write_cache(fetcher, fetcher.cache_collection(indexer, collection_id))
    end

    ##
    # Get a json representation of all thesis collections and write it as JSON to
    # a cache file.
    def self.write_all_collections_to_cache
      indexer = Indexer.new
      fetcher = Fetcher.new
      write_cache(fetcher, fetcher.cache_all_collections(indexer))
    end

    ##
    # Serialize already-built documents and write them to the fetcher's cache
    # file. Callers build the documents first, so the file is only opened (and
    # truncated) after fetching has succeeded.
    # @param fetcher [Fetcher] supplies the cache file path
    # @param documents [Array<#to_solr>]
    def self.write_cache(fetcher, documents)
      json_cache = JSON.pretty_generate(documents.map(&:to_solr))
      File.open(fetcher.json_file_path, 'w') { |f| f.puts(json_cache) }
    end
    private_class_method :write_cache

    ##
    # The DSpace id of the community we're fetching content for.
    # E.g., for handle '88435/dsp019c67wm88m', the DSpace id is 267
    # @return [String]
    # @raise [NoMethodError] when no community matches (api_community is nil)
    def api_community_id
      @api_community_id ||= api_community['id'].to_s
    end

    private

    # @return [String] URL of one page of a collection's items, with metadata expanded
    def build_collection_url(id:, offset:)
      "#{@server}/collections/#{id}/items?limit=#{@rest_limit}&offset=#{offset}&expand=metadata"
    end

    ##
    # Turn each API item into a flat hash: 'id' is the part of the handle after
    # the last slash, and every metadata key maps to the array of its values.
    # Department and certificate values are replaced through the lookup tables;
    # entries whose (mapped) value is nil are skipped. The input is not modified.
    # @param items [Array<Hash>] parsed items, each with 'handle' and 'metadata'
    # @return [Array<Hash>]
    def flatten_json(items)
      items.map do |item|
        record = { 'id' => item['handle'][%r{[^/]*$}] }
        item['metadata'].each do |entry|
          value = normalized_value(entry)
          next if value.nil?

          (record[entry['key']] ||= []) << value
        end
        record
      end
    end

    # Value of one metadata entry, mapped through the lookup table for its key if there is one.
    def normalized_value(entry)
      case entry['key']
      when 'pu.department' then map_department(entry['value'])
      when 'pu.certificate' then map_program(entry['value'])
      else entry['value']
      end
    end

    # @return [#get] HTTP client used for every API request
    def api_client
      Faraday
    end

    ##
    # Raw body of the communities listing. Any error while requesting it is
    # logged and an empty JSON array is returned instead.
    # @return [String]
    def api_communities
      @api_communities ||= begin
        response = api_client.get("#{@server}/communities/")
        response.body
      rescue StandardError => e
        # Log through the fetcher's own logger so an injected logger also sees API failures.
        logger.warn(e)
        '[]'
      end
    end

    # @return [Array<Hash>] the communities listing parsed as JSON
    def json_api_communities
      @json_api_communities ||= JSON.parse(api_communities)
    end

    ##
    # Parse the JSON feed containing all of the communities, and return only the
    # community that matches the handle.
    # @return [Hash, nil] the matching community, or nil when there is none
    def api_community
      return if json_api_communities.empty?

      @api_community ||= json_api_communities.find { |c| c['handle'] == @community }
    end

    ##
    # Get all of the collections for a given community
    # @return [String] raw response body
    def api_collections
      @api_collections ||= begin
        response = api_client.get("#{@server}/communities/#{api_community_id}/collections")
        response.body
      end
    end

    ##
    # All of the collections for a given community, parsed as JSON
    def api_collections_json
      @api_collections_json ||= JSON.parse(api_collections)
    end

    # @return [Array] the 'id' of every collection in the community
    def collections
      @collections ||= api_collections_json.map { |i| i['id'] }
    end

    # @return [String, nil] mapped department heading, nil when the name is unknown
    def map_department(dept)
      lc_authorized_departments[dept]
    end

    # @return [String, nil] mapped program heading, nil when the name is unknown
    def map_program(program)
      lc_authorized_programs[program]
    end

    # @return [Hash{String => String}] frozen department lookup table
    def lc_authorized_departments
      LC_AUTHORIZED_DEPARTMENTS
    end

    # @return [Hash{String => String}] frozen program lookup table
    def lc_authorized_programs
      LC_AUTHORIZED_PROGRAMS
    end
  end
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc