pulibrary / pdc_discovery / 3293f955-3c4c-44c4-858c-794d7972b1da

22 Aug 2023 08:46PM UTC coverage: 96.546% (-0.003%) from 96.549%

Pull Request #489: Update migration spreadsheet
CI: circleci · Committer: bess · Commit message: "Add psychology to collections list"

28 of 28 new or added lines in 2 files covered. (100.0%)
2348 of 2432 relevant lines covered (96.55%)
182.11 hits per line

Source file: /app/lib/dspace_research_data_harvester.rb (96.1% of lines covered)
Uncovered in this file: the DataSpace community-cache refresh and the final
completion log line in self.harvest; every other relevant line is exercised.

# frozen_string_literal: true

require 'csv'

##
# Harvest research data from DataSpace for indexing
class DspaceResearchDataHarvester
  COLLECTION_CONFIG = Rails.root.join('config', 'collections.csv')
  REST_LIMIT = 100
  CACHE_COMMUNITIES_FILE = Rails.root.join('spec', 'fixtures', 'files', 'dataspace_communities.json')

  ##
  # Read the collection configuration CSV and build a ResearchDataCollection per row
  # @return [Array<ResearchDataCollection>]
  def collections_to_index(collection_config = COLLECTION_CONFIG)
    collections = []
    CSV.foreach(collection_config, quote_char: '"', col_sep: ',', row_sep: :auto, headers: true) do |row|
      rdc = ResearchDataCollection.new(row)
      collections << rdc
    end
    collections
  end
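
  # A sketch of what a collections.csv row might look like. These column names
  # are assumptions inferred from the ResearchDataCollection attributes used
  # elsewhere in this class, not confirmed from the actual config file:
  #
  #   parent_community,community,collection_name,collection_id
  #   Research Data,Astrophysical Sciences,Datasets,1234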

  ##
  # Base URL of the DataSpace REST API
  def server
    "#{Rails.configuration.pdc_discovery.dataspace_url}/rest"
  end

  ##
  # For a given ResearchDataCollection, retrieve its metadata from DataSpace and index it
  # @param [ResearchDataCollection] collection
  def harvest(collection)
    collection_id = collection.collection_id
    url = "#{server}/collections/#{collection_id}/items?limit=#{REST_LIMIT}&offset=0&expand=all"

    resp = Faraday.get(url, {}, { 'Accept': 'application/xml' })
    DspaceIndexer.new(resp.body).index
  end

  ##
  # Convenience method to harvest and index all collections in the config file
  # @example
  #   DspaceResearchDataHarvester.harvest
  def self.harvest(use_cache = false)
    Rails.logger.info "Harvesting and indexing research data collections has started"

    unless use_cache
      # Fetch latest community information from DataSpace
      communities = DataspaceCommunities.new
      File.write(CACHE_COMMUNITIES_FILE, JSON.pretty_generate(communities.tree))
    end

    # Harvest research data for each collection
    r = DspaceResearchDataHarvester.new
    r.collections_to_index.each do |collection|
      Rails.logger.info "Harvesting collection id #{collection.collection_id}"
      r.harvest(collection)
    end
    Rails.logger.info "Harvesting and indexing research data collections has completed"
  end
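
  # Usage sketch: passing true skips the community-cache refresh and reuses the
  # previously written fixture file (per the use_cache guard above):
  #
  #   DspaceResearchDataHarvester.harvest(true)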

  ##
  # Column headers for the migration tracking spreadsheet
  def migration_csv_headers
    ["parent_community", "community", "collection_name", "title", "handle", "ark_url", "doi", "curator", "redescribed", "pdc_describe_id", "data_migrated"]
  end

  ##
  # Extract an item's title from its DSpace XML node
  def item_title(item_node)
    item_node.xpath("./name").text.strip
  end

  ##
  # Extract an item's handle from its DSpace XML node
  def item_handle(item_node)
    item_node.xpath("./handle").text.strip
  end
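
  # The two XPath helpers above assume item XML roughly shaped like this
  # (a sketch with hypothetical values, not the full DSpace item schema):
  #
  #   <item>
  #     <name>Some dataset title</name>
  #     <handle>88435/dsp01example</handle>
  #   </item>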

  ##
  # Sometimes the DSpace "ParentCommunity,Community,CollectionName" hierarchy has
  # three levels, sometimes only two. We want the top level to consistently show up
  # as the Parent Community. Given a three-element array of [ParentCommunity,
  # Community, CollectionName], if ParentCommunity == "NA", shift everything left
  # one position and leave CollectionName blank.
  # @param [Array] three_levels
  # @return [Array]
  def csv_communities(three_levels)
    raise "Error assigning parent_community" unless three_levels.count == 3
    return three_levels if three_levels[0] != "NA"
    three_levels[0] = three_levels[1]
    three_levels[1] = three_levels[2]
    three_levels[2] = ""
    three_levels
  end
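
  # Illustration of the shift-left behavior with hypothetical values:
  #
  #   csv_communities(["NA", "Astrophysics", "Research Data"])
  #   # => ["Astrophysics", "Research Data", ""]
  #   csv_communities(["Princeton", "Astrophysics", "Research Data"])
  #   # => ["Princeton", "Astrophysics", "Research Data"]   (unchanged)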

  ##
  # Given a collection's hierarchy labels, its collection_id, and a tracking CSV
  # location, append one migration spreadsheet row per item in the collection
  def produce_migration_spreadsheet(parent_community, community, collection_name, collection_id, tracking_csv)
    url = "#{server}/collections/#{collection_id}/items"

    resp = Faraday.get(url, {}, { 'Accept': 'application/xml' })
    xml_doc = Nokogiri::XML(resp.body)

    CSV.open(tracking_csv, "a") do |csv|
      xml_doc.xpath("/items/item").each do |item_node|
        handle = item_handle(item_node)
        collection_hierarchy = csv_communities([parent_community, community, collection_name])
        everything_else = [item_title(item_node), handle, "https://dataspace.princeton.edu/handle/#{handle}", '', '', '', '', '']
        csv << collection_hierarchy + everything_else
      end
    end
  end
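
  # Usage sketch (hypothetical arguments) appending one row per item in
  # collection 1234 to an existing tracking CSV:
  #
  #   harvester = DspaceResearchDataHarvester.new
  #   harvester.produce_migration_spreadsheet("Princeton", "Astrophysics", "Research Data", 1234, "tmp/tracking.csv")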

  ##
  # Append rows only for items whose handles are not already in the in-progress
  # spreadsheet: the delta between what needs migration and what is already tracked
  def delta_migration(parent_community, community, collection_name, collection_id, tracking_csv, in_progress_csv)
    in_progress_data = CSV.parse(File.read(in_progress_csv), headers: true)
    in_progress_handles = in_progress_data.by_col["handle"]
    url = "#{server}/collections/#{collection_id}/items"

    resp = Faraday.get(url, {}, { 'Accept': 'application/xml' })
    xml_doc = Nokogiri::XML(resp.body)

    CSV.open(tracking_csv, "a") do |csv|
      xml_doc.xpath("/items/item").each do |item_node|
        handle = item_handle(item_node)
        next if in_progress_handles.include?(handle)
        collection_hierarchy = csv_communities([parent_community, community, collection_name])
        everything_else = [item_title(item_node), handle, "https://dataspace.princeton.edu/handle/#{handle}", '', '', '', '', '']
        csv << collection_hierarchy + everything_else
      end
    end
  end

  ##
  # Generate a CSV with a row for each DSpace item that needs to be migrated to PDC Describe
  def produce_full_migration_spreadsheet(tracking_csv, collections_csv)
    Rails.logger.info "Generating DSpace migration tracking CSV"
    CSV.open(tracking_csv, "w") do |csv|
      csv << migration_csv_headers
    end
    collections_to_index(collections_csv).each do |collection|
      # TODO: parent community should be pushed to the left if it is NA
      produce_migration_spreadsheet(collection.parent_community, collection.community, collection.collection_name, collection.collection_id, tracking_csv)
    end
  end

  ##
  # Generate a CSV with a row for each DSpace item that needs to be migrated
  # and is not yet present in the in_progress spreadsheet
  def produce_delta_migration_spreadsheet(tracking_csv, collections_csv, in_progress_csv)
    Rails.logger.info "Generating DSpace DELTA migration tracking CSV"
    Rails.logger.info "Calculating delta against #{in_progress_csv}"
    CSV.open(tracking_csv, "w") do |csv|
      csv << migration_csv_headers
    end
    collections_to_index(collections_csv).each do |collection|
      delta_migration(collection.parent_community, collection.community, collection.collection_name, collection.collection_id, tracking_csv, in_progress_csv)
    end
  end
end
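
# Usage sketch for the two spreadsheet entry points (file paths are hypothetical):
#
#   harvester = DspaceResearchDataHarvester.new
#   harvester.produce_full_migration_spreadsheet("tmp/migration.csv", "config/collections.csv")
#   harvester.produce_delta_migration_spreadsheet("tmp/delta.csv", "config/collections.csv", "tmp/in_progress.csv")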