795823e8-6e3e-4881-823e-f5d46383cb0f

Committed 13 Jan 2025 07:56PM UTC coverage: 96.467% (-0.05%) from 96.52%

Build # 795823e8-6e3e-4881-823e-f5d46383cb0f

Build Type

Pull #724

circleci

Committed by

hectorcorrea

Commit Message

Made PDC JSON compatible with previous version to prevent changes throughout the code base

Pull Request Pull Request #724: Fixes error indexing (very) large records from PDC Describe

Run Details

8 of 11 new or added lines in 1 file covered. (72.73%)

2 existing lines in 1 file now uncovered.

3631 of 3764 relevant lines covered (96.47%)

300.19 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.16

/app/lib/describe_indexer.rb

# frozen_string_literal: true

require 'faraday_middleware'
require 'traject'
require 'open-uri'

##
# Fetch an RSS feed of approved works from PDC Describe. For each work, index a PDC Describe JSON resource to solr.
class DescribeIndexer
  # Seconds to wait when opening a connection to, and when reading from, a
  # PDC Describe JSON resource. Kept high because datasets with lots of files
  # (e.g. more than 30K files) can take a while to be read
  # (for example https://pdc-describe-prod.princeton.edu/describe/works/470.json)
  RESOURCE_TIMEOUT_SECONDS = 60

  ##
  # See config/pdc_discovery.yml for configuration of the RSS feed that
  # this indexer uses to harvest data from PDC Describe.
  # @param [String] rss_url URL of the RSS feed that lists the works to index
  def initialize(rss_url: Rails.configuration.pdc_discovery.pdc_describe_rss)
    @rss_url = rss_url
  end

  ##
  # Load the traject indexing config for PDC Describe JSON resources.
  # A brand new indexer is built (and the config file loaded again) on every call.
  # @return [Traject::Indexer::NokogiriIndexer]
  def traject_indexer
    Traject::Indexer::NokogiriIndexer.new.tap do |i|
      i.load_config_file(datacite_indexing_config_path)
    end
  end

  ##
  # Location of the traject config used by #traject_indexer.
  # @return [String] path to config/traject/pdc_describe_indexing_config.rb under Rails.root
  def datacite_indexing_config_path
    ::Rails.root.join('config', 'traject', 'pdc_describe_indexing_config.rb').to_s
  end

  ##
  # Only index if Rails.configuration.pdc_discovery.index_pdc_describe == true
  # See config/pdc_discovery.yml to change this setting for a given environment.
  # The comparison is deliberately strict: any value other than the boolean
  # true (including truthy strings) leaves indexing turned off.
  def index
    if Rails.configuration.pdc_discovery.index_pdc_describe == true
      perform_indexing
    else
      Rails.logger.warn "PDC Describe indexing is not turned on for this environment. See config/pdc_discovery.yml"
    end
  end

  # Converts the JSON payload to XML which is what Traject expects
  # @param [String] json
  # @return [String]
  # @raise [JSON::ParserError] if the payload is not valid JSON
  def prep_for_indexing(json)
    JSON.parse(json).to_xml
  end

  ##
  # Index a single PDC Describe JSON document.
  # @param [String] json the JSON payload of one work
  def index_one(json)
    resource_xml = prep_for_indexing(json)
    traject_indexer.process(resource_xml)
    # NOTE(review): #traject_indexer returns a new indexer on every call, so
    # `complete` runs on a different instance than the one that processed the
    # record above. Left as-is; confirm whether this call is still needed.
    traject_indexer.complete
  end

  ##
  # Memoized Solr connection taken from Blacklight's default index.
  def client
    @client ||= Blacklight.default_index.connection
  end

  ##
  # Delete the documents matching a query, then commit and optimize the index.
  # @param [String] query the delete-by-query expression
  # @return the Solr connection (see #client)
  def delete!(query:)
    client.delete_by_query(query)
    client.commit
    client.optimize
    client
  end

private

  ##
  # Open the RSS feed. When a block is given the handle is yielded and closed
  # once the block returns (and the block's value is returned); without a
  # block the open handle is returned and the caller is responsible for it.
  def rss_http_response(&block)
    URI.open(@rss_url, &block)
  end

  ##
  # Parse the RSS feed. The block form guarantees the underlying handle is
  # closed as soon as the document has been parsed.
  def rss_xml_doc
    rss_http_response { |io| Nokogiri::XML(io) }
  end

  # Text nodes holding the JSON resource URL of each item in the feed
  def rss_url_nodes
    rss_xml_doc.xpath("//item/url/text()")
  end

  # @return [Array<String>] the JSON resource URL of each item in the feed
  def rss_url_list
    rss_url_nodes.map(&:to_s)
  end

  ##
  # Parse the rss_url, get a JSON resource url for each item, convert it to XML, and pass it to traject
  def perform_indexing
    urls_to_retry = []
    rss_url_list.each do |url|
      process_url(url)
    rescue => ex
      Rails.logger.warn "Indexing: Error importing record from #{url}. Will retry. Exception: #{ex.message}"
      urls_to_retry << url
    end

    # Retry the URLs that errored and only report the ones that fail a second time
    urls_to_retry.each do |url|
      Rails.logger.info "Indexing: Retrying record #{url}."
      process_url(url)
    rescue => ex
      Rails.logger.error "Indexing: Error importing record from #{url}. Retry failed. Exception: #{ex.message}"
      Honeybadger.notify "Error importing record from #{url}. Exception: #{ex.message}"
    end
  end

  ##
  # Fetch one JSON resource, index it, and log how long each step took.
  # Any error raised while fetching or indexing propagates to the caller.
  # @param [String] url URL of the JSON resource to import
  def process_url(url)
    resource_json, elapsed_read = timed { fetch_json(url) }
    _, elapsed_index = timed { traject_indexer.process(prep_for_indexing(resource_json)) }

    timing_info = "(read: #{format('%.2f', elapsed_read)} s, index: #{format('%.2f', elapsed_index)} s)"
    Rails.logger.info "Indexing: Successfully imported record from #{url}. #{timing_info} "
  end

  ##
  # Read the body of a JSON resource. The block form of URI.open closes the
  # handle once the body has been read.
  # @param [String] url
  # @return [String] the raw response body
  def fetch_json(url)
    URI.open(url, open_timeout: RESOURCE_TIMEOUT_SECONDS, read_timeout: RESOURCE_TIMEOUT_SECONDS, &:read)
  end

  ##
  # Run the block and measure it with the monotonic clock, so the elapsed
  # time is not affected by adjustments to the system clock.
  # @return [Array] the block's value and the elapsed time in seconds (Float)
  def timed
    started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    result = yield
    [result, Process.clock_gettime(Process::CLOCK_MONOTONIC) - started]
  end
end

1	# frozen_string_literal: true
2
3	require 'faraday_middleware'	1✔
4	require 'traject'	1✔
5	require 'open-uri'	1✔
6
7	##
8	# Fetch an RSS feed of approved works from PDC Describe. For each work, index a PDC Describe JSON resource to solr.
9	class DescribeIndexer	1✔
10	##
11	# See config/pdc_discovery.yml for configuration of the RSS feed that
12	# this indexer uses to harvest data from PDC Describe.
13	# @param [String] rss_url
14	def initialize(rss_url: Rails.configuration.pdc_discovery.pdc_describe_rss)	1✔
15	@rss_url = rss_url	56✔
16	end
17
18	##
19	# Load the traject indexing config for PDC Describe JSON resources
20	def traject_indexer	1✔
21	Traject::Indexer::NokogiriIndexer.new.tap do \|i\|	927✔
22	i.load_config_file(datacite_indexing_config_path)	927✔
23	end
24	end
25
26	def datacite_indexing_config_path	1✔
27	pathname = ::Rails.root.join('config', 'traject', "pdc_describe_indexing_config.rb")	927✔
28	pathname.to_s	927✔
29	end
30
31	##
32	# Only index if Rails.configuration.pdc_discovery.index_pdc_describe == true
33	# See config/pdc_discovery.yml to change this setting for a given environment.
34	def index	1✔
35	if Rails.configuration.pdc_discovery.index_pdc_describe == true	29✔
36	perform_indexing	28✔
37	else
38	Rails.logger.warn "PDC Describe indexing is not turned on for this environment. See config/pdc_discovery.yml"	1✔
39	end
40	end
41
42	# Converts the JSON payload to XML which is what Traject expects
43	# @param [String] json
44	# @return [String]
45	def prep_for_indexing(json)	1✔
46	parsed = JSON.parse(json)	881✔
47	parsed.to_xml	881✔
48	end
49
50	def index_one(json)	1✔
51	resource_xml = prep_for_indexing(json)	45✔
52	traject_indexer.process(resource_xml)	45✔
53	traject_indexer.complete	45✔
54	end
55
56	def client	1✔
57	@client \|\|= Blacklight.default_index.connection	12✔
58	end
59
60	def delete!(query:)	1✔
61	client.delete_by_query(query)	3✔
62	client.commit	3✔
63	client.optimize	3✔
64	client	3✔
65	end
66
67	private	1✔
68
69	def rss_http_response	1✔
70	URI.open(@rss_url)	28✔
71	end
72
73	def rss_xml_doc	1✔
74	Nokogiri::XML(rss_http_response)	28✔
75	end
76
77	def rss_url_nodes	1✔
78	rss_xml_doc.xpath("//item/url/text()")	28✔
79	end
80
81	def rss_url_list	1✔
82	rss_url_nodes.map(&:to_s)	28✔
83	end
84
85	##
86	# Parse the rss_url, get a JSON resource url for each item, convert it to XML, and pass it to traject
87	def perform_indexing	1✔
88	urls_to_retry = []	28✔
89	rss_url_list.each do \|url\|	28✔
90	process_url(url)	836✔
91	rescue => ex
NEW 92	Rails.logger.warn "Indexing: Error importing record from #{url}. Will retry. Exception: #{ex.message}"	×
UNCOV 93	urls_to_retry << url	×
94	end
95
96	# retry an errored urls a second time and send error only if they don't work a second time
97	urls_to_retry.each do \|url\|	28✔
NEW 98	Rails.logger.info "Indexing: Retrying record #{url}."	×
UNCOV 99	process_url(url)	×
100	rescue => ex
NEW 101	Rails.logger.error "Indexing: Error importing record from #{url}. Retry failed. Exception: #{ex.message}"	×
102	Honeybadger.notify "Error importing record from #{url}. Exception: #{ex.message}"	×
103	end
104	end
105
106	def process_url(url)	1✔
107	# Bumping the timeout to 60 seconds because datasets with lots of files (e.g. more than 30K files)
108	# can take a while to be read (for example https://pdc-describe-prod.princeton.edu/describe/works/470.json)
109	start_read = Time.zone.now	836✔
110	uri = URI.open(url, open_timeout: 60, read_timeout: 60)	836✔
111	resource_json = uri.read	836✔
112	elapsed_read = Time.zone.now - start_read	836✔
113
114	start_index = Time.zone.now	836✔
115	resource_xml = prep_for_indexing(resource_json)	836✔
116	traject_indexer.process(resource_xml)	836✔
117	elapsed_index = Time.zone.now - start_index	836✔
118
119	timing_info = "(read: #{format('%.2f', elapsed_read)} s, index: #{format('%.2f', elapsed_index)} s)"	836✔
120	Rails.logger.info "Indexing: Successfully imported record from #{url}. #{timing_info} "	836✔
121	end
122	end

pulibrary / pdc_discovery / 795823e8-6e3e-4881-823e-f5d46383cb0f

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous