• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / pdc_discovery / 795823e8-6e3e-4881-823e-f5d46383cb0f

13 Jan 2025 07:56PM UTC coverage: 96.467% (-0.05%) from 96.52%
795823e8-6e3e-4881-823e-f5d46383cb0f

Pull #724

circleci

hectorcorrea
Made PDC JSON compatible with previous version to prevent changes throughout the code base
Pull Request #724: Fixes error indexing (very) large records from PDC Describe

8 of 11 new or added lines in 1 file covered. (72.73%)

2 existing lines in 1 file now uncovered.

3631 of 3764 relevant lines covered (96.47%)

300.19 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.16
/app/lib/describe_indexer.rb
1
# frozen_string_literal: true
2

3
require 'faraday_middleware'
1✔
4
require 'traject'
1✔
5
require 'open-uri'
1✔
6

7
##
8
# Fetch an RSS feed of approved works from PDC Describe. For each work, index a PDC Describe JSON resource to solr.
9
class DescribeIndexer
  ##
  # See config/pdc_discovery.yml for configuration of the RSS feed that
  # this indexer uses to harvest data from PDC Describe.
  # @param [String] rss_url location of the RSS feed listing the works to index
  def initialize(rss_url: Rails.configuration.pdc_discovery.pdc_describe_rss)
    @rss_url = rss_url
  end

  ##
  # Load the traject indexing config for PDC Describe JSON resources.
  # A new indexer is built on every call; nothing is memoized here.
  # @return [Traject::Indexer::NokogiriIndexer]
  def traject_indexer
    Traject::Indexer::NokogiriIndexer.new.tap do |i|
      i.load_config_file(datacite_indexing_config_path)
    end
  end

  ##
  # Absolute path of the traject config file used by #traject_indexer.
  # @return [String]
  def datacite_indexing_config_path
    pathname = ::Rails.root.join('config', 'traject', "pdc_describe_indexing_config.rb")
    pathname.to_s
  end

  ##
  # Only index if Rails.configuration.pdc_discovery.index_pdc_describe == true
  # See config/pdc_discovery.yml to change this setting for a given environment.
  # The comparison is deliberately strict: any value other than `true` only logs a warning.
  def index
    if Rails.configuration.pdc_discovery.index_pdc_describe == true
      perform_indexing
    else
      Rails.logger.warn "PDC Describe indexing is not turned on for this environment. See config/pdc_discovery.yml"
    end
  end

  # Converts the JSON payload to XML which is what Traject expects
  # @param [String] json
  # @return [String]
  def prep_for_indexing(json)
    parsed = JSON.parse(json)
    parsed.to_xml
  end

  ##
  # Index a single PDC Describe JSON document.
  # @param [String] json
  def index_one(json)
    resource_xml = prep_for_indexing(json)
    traject_indexer.process(resource_xml)
    # NOTE(review): traject_indexer builds a new indexer on each call, so `complete`
    # runs on a different instance than the one that ran `process` above.
    # Confirm this is intended before changing it.
    traject_indexer.complete
  end

  ##
  # Memoized Solr connection taken from Blacklight's default index.
  def client
    @client ||= Blacklight.default_index.connection
  end

  ##
  # Delete every document matching the query, then commit and optimize the index.
  # @param [String] query
  # @return the Solr connection, so that calls can be chained
  def delete!(query:)
    client.delete_by_query(query)
    client.commit
    client.optimize
    client
  end

  private

  ##
  # Open the RSS feed. The caller is responsible for closing the returned handle.
  def rss_http_response
    URI.open(@rss_url)
  end

  ##
  # Parse the RSS feed into an XML document and release the underlying handle.
  def rss_xml_doc
    response = rss_http_response
    Nokogiri::XML(response)
  ensure
    # The document is fully parsed at this point, so the handle is no longer needed.
    response.close if response.respond_to?(:close)
  end

  ##
  # Text nodes holding the JSON resource URL of each item in the feed.
  def rss_url_nodes
    rss_xml_doc.xpath("//item/url/text()")
  end

  ##
  # @return [Array<String>] the JSON resource URLs listed in the feed
  def rss_url_list
    rss_url_nodes.map(&:to_s)
  end

  ##
  # Parse the rss_url, get a JSON resource url for each item, convert it to XML, and pass it to traject
  def perform_indexing
    urls_to_retry = []
    rss_url_list.each do |url|
      process_url(url)
    rescue => ex
      Rails.logger.warn "Indexing: Error importing record from #{url}. Will retry. Exception: #{ex.message}"
      urls_to_retry << url
    end

    # Retry each failed URL once; report the error only if the second attempt fails too.
    urls_to_retry.each do |url|
      Rails.logger.info "Indexing: Retrying record #{url}."
      process_url(url)
    rescue => ex
      Rails.logger.error "Indexing: Error importing record from #{url}. Retry failed. Exception: #{ex.message}"
      Honeybadger.notify "Error importing record from #{url}. Exception: #{ex.message}"
    end
  end

  ##
  # Fetch one JSON resource, index it, and log how long the read and the indexing took.
  # Any exception raised while reading or indexing propagates to the caller.
  # @param [String] url location of the PDC Describe JSON resource
  def process_url(url)
    # The timeouts are 60 seconds because datasets with lots of files (e.g. more than 30K files)
    # can take a while to be read (for example https://pdc-describe-prod.princeton.edu/describe/works/470.json)
    start_read = monotonic_now
    # The block form closes the handle as soon as the body has been read
    # (open-uri may back large responses with a temporary file).
    resource_json = URI.open(url, open_timeout: 60, read_timeout: 60, &:read)
    elapsed_read = monotonic_now - start_read

    start_index = monotonic_now
    resource_xml = prep_for_indexing(resource_json)
    traject_indexer.process(resource_xml)
    elapsed_index = monotonic_now - start_index

    timing_info = "(read: #{format('%.2f', elapsed_read)} s, index: #{format('%.2f', elapsed_index)} s)"
    Rails.logger.info "Indexing: Successfully imported record from #{url}. #{timing_info} "
  end

  ##
  # Seconds on the monotonic clock; unlike wall-clock time it is not affected by
  # system clock adjustments, which makes it suitable for measuring durations.
  # @return [Float]
  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc