• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / pdc_discovery / 529bb1cf-d0bf-4901-97c6-12679aea8117

17 Aug 2023 01:30PM UTC coverage: 90.265% (-6.3%) from 96.544%
529bb1cf-d0bf-4901-97c6-12679aea8117

Pull #478

circleci

carolyncole
Updates to css to fix tests after bundle update
Also ran rubocop -A && Prettier
Pull Request #478: Switching to selenium to fix CI

2 of 2 new or added lines in 2 files covered. (100.0%)

2142 of 2373 relevant lines covered (90.27%)

93.35 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

47.37
/app/lib/dspace_research_data_harvester.rb
1
# frozen_string_literal: true
2

3
require 'csv'
1✔
4

5
##
6
# Harvest research data from DataSpace for indexing
7
class DspaceResearchDataHarvester
  # CSV listing the DataSpace collections to harvest (see #collections_to_index).
  COLLECTION_CONFIG = Rails.root.join('config', 'collections.csv')
  # Maximum number of items requested per REST call.
  # NOTE(review): #harvest always requests offset=0, so a collection with more
  # than REST_LIMIT items would be silently truncated — confirm no collection
  # exceeds this, or add pagination.
  REST_LIMIT = 100
  # Cached copy of the DataSpace community tree, refreshed by .harvest.
  CACHE_COMMUNITIES_FILE = Rails.root.join('spec', 'fixtures', 'files', 'dataspace_communities.json')

  ##
  # Parse the collection configuration CSV into collection objects.
  # @param collection_config [Pathname, String] path to the CSV (defaults to COLLECTION_CONFIG)
  # @return [Array<ResearchDataCollection>] one entry per data row in the CSV
  def collections_to_index(collection_config = COLLECTION_CONFIG)
    CSV.foreach(collection_config, quote_char: '"', col_sep: ',', row_sep: :auto, headers: true)
       .map { |row| ResearchDataCollection.new(row) }
  end

  # Base URL of the DataSpace REST API.
  # @return [String]
  def server
    "#{Rails.configuration.pdc_discovery.dataspace_url}/rest"
  end

  ##
  # For a given ResearchDataCollection, retrieve its metadata from DataSpace
  # and hand the XML response off to the indexer.
  # @param [ResearchDataCollection] collection
  def harvest(collection)
    collection_id = collection.collection_id
    url = "#{server}/collections/#{collection_id}/items?limit=#{REST_LIMIT}&offset=0&expand=all"

    resp = Faraday.get(url, {}, { 'Accept': 'application/xml' })
    DspaceIndexer.new(resp.body).index
  end

  ##
  # Convenience method to harvest and index all collections in the config file.
  # Refreshes the cached community tree first unless +use_cache+ is true.
  # @param use_cache [Boolean] when true, skip re-fetching DataSpace communities
  # @example
  #   DspaceResearchDataHarvester.harvest
  def self.harvest(use_cache = false)
    Rails.logger.info "Harvesting and indexing research data collections has started"

    unless use_cache
      # Fetch latest community information from DataSpace
      communities = DataspaceCommunities.new
      File.write(CACHE_COMMUNITIES_FILE, JSON.pretty_generate(communities.tree))
    end

    # Harvest research data for each collection
    harvester = DspaceResearchDataHarvester.new
    harvester.collections_to_index.each do |collection|
      Rails.logger.info "Harvesting collection id #{collection.collection_id}"
      harvester.harvest(collection)
    end
    Rails.logger.info "Harvesting and indexing research data collections has completed"
  end

  # Column headers for the migration tracking spreadsheet
  # (see #produce_full_migration_spreadsheet).
  # @return [Array<String>]
  def migration_csv_headers
    ["parent_community", "community", "collection_name", "title", "handle", "ark_url", "doi", "curator", "redescribed", "pdc_describe_id", "data_migrated"]
  end

  # Extract an item's title from its <name> element.
  # @param item_node [Nokogiri::XML::Node]
  # @return [String]
  def item_title(item_node)
    item_node.xpath("./name").text.strip
  end

  # Extract an item's handle from its <handle> element.
  # @param item_node [Nokogiri::XML::Node]
  # @return [String]
  def item_handle(item_node)
    item_node.xpath("./handle").text.strip
  end

  ##
  # Sometimes the DSpace "ParentCommunity,Community,CollectionName" has three levels of hierarchy,
  # sometimes only two. We want the top level to consistently show up as the Parent Community.
  # Given an array with three elements, "ParentCommunity,Community,CollectionName",
  # if the ParentCommunity=="NA" shift everything left one space and leave CollectionName blank.
  # @param [Array] three_levels
  # @return [Array] a three-element array; a new array is returned in the "NA"
  #   case (the input is no longer mutated in place)
  # @raise [RuntimeError] when the input does not have exactly three elements
  def csv_communities(three_levels)
    raise "Error assigning parent_community" unless three_levels.count == 3
    return three_levels if three_levels[0] != "NA"
    # Drop the "NA" placeholder and shift the remaining levels left.
    [three_levels[1], three_levels[2], ""]
  end

  ##
  # Given a collection_id and a file location, produce a migration spreadsheet:
  # appends one row per item in the collection to +tracking_csv+.
  # @param parent_community [String]
  # @param community [String]
  # @param collection_name [String]
  # @param collection_id [String, Integer] DataSpace collection id
  # @param tracking_csv [String, Pathname] CSV file to append rows to
  def produce_migration_spreadsheet(parent_community, community, collection_name, collection_id, tracking_csv)
    url = "#{server}/collections/#{collection_id}/items"

    resp = Faraday.get(url, {}, { 'Accept': 'application/xml' })
    xml_doc = Nokogiri::XML(resp.body)

    CSV.open(tracking_csv, "a") do |csv|
      xml_doc.xpath("/items/item").each do |item_node|
        handle = item_handle(item_node)
        collection_hierarchy = csv_communities([parent_community, community, collection_name])
        # Trailing blanks are the doi/curator/redescribed/pdc_describe_id/
        # data_migrated columns, filled in manually during migration tracking.
        everything_else = [item_title(item_node), handle, "https://dataspace.princeton.edu/handle/#{handle}", '', '', '', '', '']
        csv << collection_hierarchy + everything_else
      end
    end
  end

  ##
  # Generate a CSV with a row for each DSpace item that needs to be migrated to PDC Describe.
  # @param tracking_csv [String, Pathname] output CSV path (overwritten, then appended to)
  # @param collections_csv [String, Pathname] collection configuration CSV to read
  def produce_full_migration_spreadsheet(tracking_csv, collections_csv)
    Rails.logger.info "Generating DSpace migration tracking CSV"
    # Write the header row first; per-collection rows are appended below.
    CSV.open(tracking_csv, "w") do |csv|
      csv << migration_csv_headers
    end
    collections_to_index(collections_csv).each do |collection|
      # The "NA" parent-community case is handled by csv_communities, called
      # from produce_migration_spreadsheet.
      produce_migration_spreadsheet(collection.parent_community, collection.community, collection.collection_name, collection.collection_id, tracking_csv)
    end
  end
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc