• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / pdc_discovery / 529bb1cf-d0bf-4901-97c6-12679aea8117

17 Aug 2023 01:30PM UTC coverage: 90.265% (-6.3%) from 96.544%
529bb1cf-d0bf-4901-97c6-12679aea8117

Pull #478

circleci

carolyncole
Updates to css to fix tests after bundle update
Also ran rubocop -A && Prettier
Pull Request #478: Switching to selenium to fix CI

2 of 2 new or added lines in 2 files covered. (100.0%)

2142 of 2373 relevant lines covered (90.27%)

93.35 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

47.37
/app/lib/dspace_research_data_harvester.rb
1
# frozen_string_literal: true
2

3
require 'csv'
1✔
4

5
##
6
# Harvest research data from DataSpace for indexing
7
class DspaceResearchDataHarvester
  # CSV listing the DataSpace collections to harvest (see #collections_to_index).
  COLLECTION_CONFIG = Rails.root.join('config', 'collections.csv')
  # Maximum number of items requested per REST call.
  # NOTE(review): #harvest always requests offset=0, so a collection with more
  # than REST_LIMIT items would be silently truncated — confirm no collection
  # exceeds this, or add pagination.
  REST_LIMIT = 100
  # Cached copy of the DataSpace community tree, refreshed by .harvest.
  CACHE_COMMUNITIES_FILE = Rails.root.join('spec', 'fixtures', 'files', 'dataspace_communities.json')

  ##
  # Parse the collection configuration CSV into collection objects.
  # @param collection_config [Pathname, String] path to the CSV (defaults to COLLECTION_CONFIG)
  # @return [Array<ResearchDataCollection>] one entry per data row in the CSV
  def collections_to_index(collection_config = COLLECTION_CONFIG)
    CSV.foreach(collection_config, quote_char: '"', col_sep: ',', row_sep: :auto, headers: true)
       .map { |row| ResearchDataCollection.new(row) }
  end

  # Base URL of the DataSpace REST API.
  # @return [String]
  def server
    "#{Rails.configuration.pdc_discovery.dataspace_url}/rest"
  end

  ##
  # For a given ResearchDataCollection, retrieve its metadata from DataSpace
  # and hand the XML response off to the indexer.
  # @param [ResearchDataCollection] collection
  def harvest(collection)
    collection_id = collection.collection_id
    url = "#{server}/collections/#{collection_id}/items?limit=#{REST_LIMIT}&offset=0&expand=all"

    resp = Faraday.get(url, {}, { 'Accept': 'application/xml' })
    DspaceIndexer.new(resp.body).index
  end

  ##
  # Convenience method to harvest and index all collections in the config file.
  # Refreshes the cached community tree first unless +use_cache+ is true.
  # @param use_cache [Boolean] when true, skip re-fetching DataSpace communities
  # @example
  #   DspaceResearchDataHarvester.harvest
  def self.harvest(use_cache = false)
    Rails.logger.info "Harvesting and indexing research data collections has started"

    unless use_cache
      # Fetch latest community information from DataSpace
      communities = DataspaceCommunities.new
      File.write(CACHE_COMMUNITIES_FILE, JSON.pretty_generate(communities.tree))
    end

    # Harvest research data for each collection
    harvester = DspaceResearchDataHarvester.new
    harvester.collections_to_index.each do |collection|
      Rails.logger.info "Harvesting collection id #{collection.collection_id}"
      harvester.harvest(collection)
    end
    Rails.logger.info "Harvesting and indexing research data collections has completed"
  end

  # Column headers for the migration tracking spreadsheet
  # (see #produce_full_migration_spreadsheet).
  # @return [Array<String>]
  def migration_csv_headers
    ["parent_community", "community", "collection_name", "title", "handle", "ark_url", "doi", "curator", "redescribed", "pdc_describe_id", "data_migrated"]
  end

  # Extract an item's title from its <name> element.
  # @param item_node [Nokogiri::XML::Node]
  # @return [String]
  def item_title(item_node)
    item_node.xpath("./name").text.strip
  end

  # Extract an item's handle from its <handle> element.
  # @param item_node [Nokogiri::XML::Node]
  # @return [String]
  def item_handle(item_node)
    item_node.xpath("./handle").text.strip
  end

  ##
  # Sometimes the DSpace "ParentCommunity,Community,CollectionName" has three levels of hierarchy,
  # sometimes only two. We want the top level to consistently show up as the Parent Community.
  # Given an array with three elements, "ParentCommunity,Community,CollectionName",
  # if the ParentCommunity=="NA" shift everything left one space and leave CollectionName blank.
  # @param [Array] three_levels
  # @return [Array] a three-element array; a new array is returned in the "NA"
  #   case (the input is no longer mutated in place)
  # @raise [RuntimeError] when the input does not have exactly three elements
  def csv_communities(three_levels)
    raise "Error assigning parent_community" unless three_levels.count == 3
    return three_levels if three_levels[0] != "NA"
    # Drop the "NA" placeholder and shift the remaining levels left.
    [three_levels[1], three_levels[2], ""]
  end

  ##
  # Given a collection_id and a file location, produce a migration spreadsheet:
  # appends one row per item in the collection to +tracking_csv+.
  # @param parent_community [String]
  # @param community [String]
  # @param collection_name [String]
  # @param collection_id [String, Integer] DataSpace collection id
  # @param tracking_csv [String, Pathname] CSV file to append rows to
  def produce_migration_spreadsheet(parent_community, community, collection_name, collection_id, tracking_csv)
    url = "#{server}/collections/#{collection_id}/items"

    resp = Faraday.get(url, {}, { 'Accept': 'application/xml' })
    xml_doc = Nokogiri::XML(resp.body)

    CSV.open(tracking_csv, "a") do |csv|
      xml_doc.xpath("/items/item").each do |item_node|
        handle = item_handle(item_node)
        collection_hierarchy = csv_communities([parent_community, community, collection_name])
        # Trailing blanks are the doi/curator/redescribed/pdc_describe_id/
        # data_migrated columns, filled in manually during migration tracking.
        everything_else = [item_title(item_node), handle, "https://dataspace.princeton.edu/handle/#{handle}", '', '', '', '', '']
        csv << collection_hierarchy + everything_else
      end
    end
  end

  ##
  # Generate a CSV with a row for each DSpace item that needs to be migrated to PDC Describe.
  # @param tracking_csv [String, Pathname] output CSV path (overwritten, then appended to)
  # @param collections_csv [String, Pathname] collection configuration CSV to read
  def produce_full_migration_spreadsheet(tracking_csv, collections_csv)
    Rails.logger.info "Generating DSpace migration tracking CSV"
    # Write the header row first; per-collection rows are appended below.
    CSV.open(tracking_csv, "w") do |csv|
      csv << migration_csv_headers
    end
    collections_to_index(collections_csv).each do |collection|
      # The "NA" parent-community case is handled by csv_communities, called
      # from produce_migration_spreadsheet.
      produce_migration_spreadsheet(collection.parent_community, collection.community, collection.collection_name, collection.collection_id, tracking_csv)
    end
  end
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc