pulibrary / pdc_discovery / 3293f955-3c4c-44c4-858c-794d7972b1da

22 Aug 2023 08:46PM UTC coverage: 96.546% (-0.003%) from 96.549%

Pull Request #489: Update migration spreadsheet
CI: circleci · Committer: bess · Commit message: "Add psychology to collections list"

28 of 28 new or added lines in 2 files covered. (100.0%)
2348 of 2432 relevant lines covered (96.55%)
182.11 hits per line

Source file: /app/lib/dspace_research_data_harvester.rb (96.1% of lines covered)
Uncovered in this file: the DataSpace community-cache refresh and the final
completion log line in self.harvest; every other relevant line is exercised.

# frozen_string_literal: true

require 'csv'

##
# Harvest research data from DataSpace for indexing
class DspaceResearchDataHarvester
  COLLECTION_CONFIG = Rails.root.join('config', 'collections.csv')
  REST_LIMIT = 100
  CACHE_COMMUNITIES_FILE = Rails.root.join('spec', 'fixtures', 'files', 'dataspace_communities.json')

  ##
  # Read the collection configuration CSV and build a ResearchDataCollection per row
  # @return [Array<ResearchDataCollection>]
  def collections_to_index(collection_config = COLLECTION_CONFIG)
    collections = []
    CSV.foreach(collection_config, quote_char: '"', col_sep: ',', row_sep: :auto, headers: true) do |row|
      rdc = ResearchDataCollection.new(row)
      collections << rdc
    end
    collections
  end
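
  # A sketch of what a collections.csv row might look like. These column names
  # are assumptions inferred from the ResearchDataCollection attributes used
  # elsewhere in this class, not confirmed from the actual config file:
  #
  #   parent_community,community,collection_name,collection_id
  #   Research Data,Astrophysical Sciences,Datasets,1234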

  ##
  # Base URL of the DataSpace REST API
  def server
    "#{Rails.configuration.pdc_discovery.dataspace_url}/rest"
  end

  ##
  # For a given ResearchDataCollection, retrieve its metadata from DataSpace and index it
  # @param [ResearchDataCollection] collection
  def harvest(collection)
    collection_id = collection.collection_id
    url = "#{server}/collections/#{collection_id}/items?limit=#{REST_LIMIT}&offset=0&expand=all"

    resp = Faraday.get(url, {}, { 'Accept': 'application/xml' })
    DspaceIndexer.new(resp.body).index
  end

  ##
  # Convenience method to harvest and index all collections in the config file
  # @example
  #   DspaceResearchDataHarvester.harvest
  def self.harvest(use_cache = false)
    Rails.logger.info "Harvesting and indexing research data collections has started"

    unless use_cache
      # Fetch latest community information from DataSpace
      communities = DataspaceCommunities.new
      File.write(CACHE_COMMUNITIES_FILE, JSON.pretty_generate(communities.tree))
    end

    # Harvest research data for each collection
    r = DspaceResearchDataHarvester.new
    r.collections_to_index.each do |collection|
      Rails.logger.info "Harvesting collection id #{collection.collection_id}"
      r.harvest(collection)
    end
    Rails.logger.info "Harvesting and indexing research data collections has completed"
  end
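
  # Usage sketch: passing true skips the community-cache refresh and reuses the
  # previously written fixture file (per the use_cache guard above):
  #
  #   DspaceResearchDataHarvester.harvest(true)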

  ##
  # Column headers for the migration tracking spreadsheet
  def migration_csv_headers
    ["parent_community", "community", "collection_name", "title", "handle", "ark_url", "doi", "curator", "redescribed", "pdc_describe_id", "data_migrated"]
  end

  ##
  # Extract an item's title from its DSpace XML node
  def item_title(item_node)
    item_node.xpath("./name").text.strip
  end

  ##
  # Extract an item's handle from its DSpace XML node
  def item_handle(item_node)
    item_node.xpath("./handle").text.strip
  end
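
  # The two XPath helpers above assume item XML roughly shaped like this
  # (a sketch with hypothetical values, not the full DSpace item schema):
  #
  #   <item>
  #     <name>Some dataset title</name>
  #     <handle>88435/dsp01example</handle>
  #   </item>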

  ##
  # Sometimes the DSpace "ParentCommunity,Community,CollectionName" hierarchy has
  # three levels, sometimes only two. We want the top level to consistently show up
  # as the Parent Community. Given a three-element array of [ParentCommunity,
  # Community, CollectionName], if ParentCommunity == "NA", shift everything left
  # one position and leave CollectionName blank.
  # @param [Array] three_levels
  # @return [Array]
  def csv_communities(three_levels)
    raise "Error assigning parent_community" unless three_levels.count == 3
    return three_levels if three_levels[0] != "NA"
    three_levels[0] = three_levels[1]
    three_levels[1] = three_levels[2]
    three_levels[2] = ""
    three_levels
  end
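
  # Illustration of the shift-left behavior with hypothetical values:
  #
  #   csv_communities(["NA", "Astrophysics", "Research Data"])
  #   # => ["Astrophysics", "Research Data", ""]
  #   csv_communities(["Princeton", "Astrophysics", "Research Data"])
  #   # => ["Princeton", "Astrophysics", "Research Data"]   (unchanged)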

  ##
  # Given a collection's hierarchy labels, its collection_id, and a tracking CSV
  # location, append one migration spreadsheet row per item in the collection
  def produce_migration_spreadsheet(parent_community, community, collection_name, collection_id, tracking_csv)
    url = "#{server}/collections/#{collection_id}/items"

    resp = Faraday.get(url, {}, { 'Accept': 'application/xml' })
    xml_doc = Nokogiri::XML(resp.body)

    CSV.open(tracking_csv, "a") do |csv|
      xml_doc.xpath("/items/item").each do |item_node|
        handle = item_handle(item_node)
        collection_hierarchy = csv_communities([parent_community, community, collection_name])
        everything_else = [item_title(item_node), handle, "https://dataspace.princeton.edu/handle/#{handle}", '', '', '', '', '']
        csv << collection_hierarchy + everything_else
      end
    end
  end
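
  # Usage sketch (hypothetical arguments) appending one row per item in
  # collection 1234 to an existing tracking CSV:
  #
  #   harvester = DspaceResearchDataHarvester.new
  #   harvester.produce_migration_spreadsheet("Princeton", "Astrophysics", "Research Data", 1234, "tmp/tracking.csv")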

  ##
  # Append rows only for items whose handles are not already in the in-progress
  # spreadsheet: the delta between what needs migration and what is already tracked
  def delta_migration(parent_community, community, collection_name, collection_id, tracking_csv, in_progress_csv)
    in_progress_data = CSV.parse(File.read(in_progress_csv), headers: true)
    in_progress_handles = in_progress_data.by_col["handle"]
    url = "#{server}/collections/#{collection_id}/items"

    resp = Faraday.get(url, {}, { 'Accept': 'application/xml' })
    xml_doc = Nokogiri::XML(resp.body)

    CSV.open(tracking_csv, "a") do |csv|
      xml_doc.xpath("/items/item").each do |item_node|
        handle = item_handle(item_node)
        next if in_progress_handles.include?(handle)
        collection_hierarchy = csv_communities([parent_community, community, collection_name])
        everything_else = [item_title(item_node), handle, "https://dataspace.princeton.edu/handle/#{handle}", '', '', '', '', '']
        csv << collection_hierarchy + everything_else
      end
    end
  end

  ##
  # Generate a CSV with a row for each DSpace item that needs to be migrated to PDC Describe
  def produce_full_migration_spreadsheet(tracking_csv, collections_csv)
    Rails.logger.info "Generating DSpace migration tracking CSV"
    CSV.open(tracking_csv, "w") do |csv|
      csv << migration_csv_headers
    end
    collections_to_index(collections_csv).each do |collection|
      # TODO: parent community should be pushed to the left if it is NA
      produce_migration_spreadsheet(collection.parent_community, collection.community, collection.collection_name, collection.collection_id, tracking_csv)
    end
  end

  ##
  # Generate a CSV with a row for each DSpace item that needs to be migrated
  # and is not yet present in the in_progress spreadsheet
  def produce_delta_migration_spreadsheet(tracking_csv, collections_csv, in_progress_csv)
    Rails.logger.info "Generating DSpace DELTA migration tracking CSV"
    Rails.logger.info "Calculating delta against #{in_progress_csv}"
    CSV.open(tracking_csv, "w") do |csv|
      csv << migration_csv_headers
    end
    collections_to_index(collections_csv).each do |collection|
      delta_migration(collection.parent_community, collection.community, collection.collection_name, collection.collection_id, tracking_csv, in_progress_csv)
    end
  end
end
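
# Usage sketch for the two spreadsheet entry points (file paths are hypothetical):
#
#   harvester = DspaceResearchDataHarvester.new
#   harvester.produce_full_migration_spreadsheet("tmp/migration.csv", "config/collections.csv")
#   harvester.produce_delta_migration_spreadsheet("tmp/delta.csv", "config/collections.csv", "tmp/in_progress.csv")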