• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / lib_jobs / 11b19a82-e82c-4c79-ad29-8b9658d7ac88

08 Oct 2025 02:47PM UTC coverage: 96.319% (-0.04%) from 96.356%
11b19a82-e82c-4c79-ad29-8b9658d7ac88

Pull #973

circleci

christinach
Add a job to get family, person and corporate entities agents

related to [#964]
Pull Request #973: Add a job to get family, person and corporate entities agents

123 of 129 new or added lines in 1 file covered. (95.35%)

3349 of 3477 relevant lines covered (96.32%)

1221.54 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.35
/app/models/aspace_version_control/get_agents_job.rb
1
# frozen_string_literal: true
2
require 'archivesspace/client'
1✔
3
require 'nokogiri'
1✔
4

5
module AspaceVersionControl
1✔
6
  # rubocop:disable Metrics/ClassLength
7
  class GetAgentsJob < LibJob
1✔
8
    # Sets up the job under the "Agents_export" category and starts with an
    # empty list of collected error messages (read later by #report).
    def initialize
      super(category: 'Agents_export')
      @errors = []
    end
12

13
    # Builds the ArchivesSpace configuration and hands it to #aspace_client.
    # Returns whatever #aspace_client returns; errors from either step propagate.
    def aspace_login
      config = aspace_config
      aspace_client(config)
    end
16

17
    # Returns the memoized ArchivesSpace configuration built from ASPACE_URL,
    # ASPACE_USER and ASPACE_PASSWORD.
    #
    # Raises RuntimeError when any of the three variables is unset. The check
    # runs on every call, even when a configuration is already memoized.
    def aspace_config
      required = %w[ASPACE_URL ASPACE_USER ASPACE_PASSWORD]
      unless required.all? { |name| ENV[name] }
        raise "Missing required environment variables: ASPACE_URL, ASPACE_USER, or ASPACE_PASSWORD"
      end

      return @config if @config

      settings = {
        base_uri: ENV['ASPACE_URL'],
        username: ENV['ASPACE_USER'],
        password: ENV['ASPACE_PASSWORD'],
        throttle: 0,
        # NOTE(review): TLS certificate verification is switched off here; confirm this is intended.
        verify_ssl: false
      }
      @config = ArchivesSpace::Configuration.new(settings)
    end
28

29
    # Returns the memoized, logged-in ArchivesSpace client, creating it from
    # +config+ on first use.
    #
    # Any StandardError raised while connecting is logged, recorded in @errors
    # and then re-raised to the caller.
    def aspace_client(config)
      return @client if @client

      @client = ArchivesSpace::Client.new(config).login
    rescue StandardError => e
      reason = e.message
      Rails.logger.error("Failed to connect to ArchivesSpace: #{reason}")
      @errors << "ArchivesSpace connection failed: #{reason}"
      raise
    end
36

37
    # Job entry point: logs in to ArchivesSpace, then stores the textual
    # report and the current time on +data_set+ and returns it.
    def handle(data_set:)
      aspace_login
      # TODO:  agent job
      data_set.tap do |result|
        result.data = report
        result.report_time = Time.zone.now
      end
    end
44

45
    # Summarises the run: a fixed success message when no errors were
    # collected, otherwise all collected messages joined with ", ".
    def report
      return "Agents successfully exported." if @errors.empty?

      @errors.join(', ')
    end
52

53
    # Fetches (once) and returns the parsed response of "agents/families"
    # requested with all_ids: true. Requires @client to be set already.
    def list_family_agents
      return @family_agents if @family_agents

      response = @client.get("agents/families", { query: { all_ids: true } })
      @family_agents = response.parsed
    end
58

59
    # Fetches (once) and returns the parsed response of
    # 'agents/corporate_entities' requested with all_ids: true.
    # Requires @client to be set already.
    def list_corporate_entities_agents
      return @corporate_entities_agents if @corporate_entities_agents

      response = @client.get('agents/corporate_entities', { query: { all_ids: true } })
      @corporate_entities_agents = response.parsed
    end
64

65
    # Fetches (once) and returns the parsed response of 'agents/people'
    # requested with all_ids: true. Requires @client to be set already.
    def list_person_agents
      return @person_agents if @person_agents

      response = @client.get('agents/people', { query: { all_ids: true } })
      @person_agents = response.parsed
    end
70

71
    # Downloads the archival-context XML for one agent and pairs it with the
    # file name it should be stored under.
    #
    # Returns a hash with :xml_content (the document re-serialised by
    # Nokogiri) and :filename ("<generated base>.CPF.xml").
    def get_archival_context_xml(id, agent_type)
      # repository 1 is the global repository in aspace that has all the agents
      response = @client.get("/repositories/1/archival_contexts/#{agent_type}/#{id}.xml")
      eac_doc = Nokogiri::XML(response.body)
      base_name = generate_cpf_filename(eac_doc, id, agent_type)

      { xml_content: eac_doc.to_xml, filename: "#{base_name}.CPF.xml" }
    end
83

84
    # Methods to use separately if we want to schedule them individually
85
    # or run them ad hoc
86

87
    # Convenience wrapper: archival-context XML for a 'people' agent.
    def get_person_archival_context_xml(id)
      agent_type = 'people'
      get_archival_context_xml(id, agent_type)
    end
90

91
    # Convenience wrapper: archival-context XML for a 'families' agent.
    def get_family_archival_context_xml(id)
      agent_type = 'families'
      get_archival_context_xml(id, agent_type)
    end
94

95
    # Convenience wrapper: archival-context XML for a 'corporate_entities' agent.
    def get_corporate_entity_archival_context_xml(id)
      agent_type = 'corporate_entities'
      get_archival_context_xml(id, agent_type)
    end
98

99
    # Fetches the CPF XML for one agent and writes it into +dir+.
    #
    # dir        - directory the file is written to (must already exist)
    # id         - agent id
    # agent_type - 'people', 'families' or 'corporate_entities'
    #
    # Returns the bare file name (without +dir+) on success. Any StandardError
    # is logged, recorded in @errors and swallowed; in that case nil is
    # returned so callers can tell failure from success. (Previously the
    # failure path leaked the truthy @errors array as its return value.)
    def write_cpf_to_file(dir, id, agent_type)
      Rails.logger.info("Processing CPF for #{agent_type}/#{id}")

      cpf_data = get_archival_context_xml(id, agent_type)
      filename = "#{dir}/#{cpf_data[:filename]}"
      File.write(filename, cpf_data[:xml_content])

      Rails.logger.info("Wrote CPF XML to #{filename}")
      cpf_data[:filename]
    rescue => error
      err = "Unable to process CPF for #{agent_type}/#{id}: #{error.message}"
      log_stdout(err)
      log_stderr(err)
      nil
    end
116

117
    # rubocop:disable Metrics/MethodLength
118
    # Exports a CPF file for every agent of +agent_type+ into +output_dir+.
    #
    # agent_type - 'people', 'families' or 'corporate_entities'
    # output_dir - target directory; created if missing
    # chunk_size - passed through to the batch step as the checkpoint interval
    # start_from - index to start at; resolve_start_position may replace it
    #              with a saved checkpoint
    #
    # Returns the number of agents the batch step reported as successful.
    # The checkpoint file is removed once the batch step returns.
    def process_all_cpf_files(agent_type, output_dir, chunk_size: 500, start_from: 0)
      ids = get_agent_ids_by_type(agent_type)
      total = ids.count

      Rails.logger.info("Processing #{total} #{agent_type} agents for CPF files (starting from #{start_from})")
      FileUtils.mkdir_p(output_dir)

      checkpoint_path = "#{output_dir}/#{agent_type}_checkpoint.txt"
      resume_at = resolve_start_position(checkpoint_path, start_from)

      succeeded = process_agent_batch(ids, agent_type, {
                                        output_dir: output_dir,
                                        start_from: resume_at,
                                        total_count: total,
                                        chunk_size: chunk_size,
                                        checkpoint_file: checkpoint_path
                                      })

      cleanup_checkpoint(checkpoint_path)
      attempted = total - resume_at
      Rails.logger.info("Completed processing: #{succeeded} #{agent_type} agents successful out of #{attempted} attempted")
      succeeded
    end
141
    # rubocop:enable Metrics/MethodLength
142

143
    # Methods for each agent type in case we want to run them separately
144
    # Runs process_all_cpf_files for 'people' agents only.
    def process_all_person_cpf_files(output_dir, chunk_size: 500, start_from: 0)
      options = { chunk_size: chunk_size, start_from: start_from }
      process_all_cpf_files('people', output_dir, **options)
    end
147

148
    # Runs process_all_cpf_files for 'families' agents only.
    def process_all_family_cpf_files(output_dir, chunk_size: 500, start_from: 0)
      options = { chunk_size: chunk_size, start_from: start_from }
      process_all_cpf_files('families', output_dir, **options)
    end
151

152
    # Runs process_all_cpf_files for 'corporate_entities' agents only.
    def process_all_corporate_entity_cpf_files(output_dir, chunk_size: 500, start_from: 0)
      options = { chunk_size: chunk_size, start_from: start_from }
      process_all_cpf_files('corporate_entities', output_dir, **options)
    end
155

156
    private
1✔
157

158
    # Maps an agent type name to the matching list_* method and returns its
    # result. Raises RuntimeError for any other type name.
    def get_agent_ids_by_type(agent_type)
      listers = {
        'people' => :list_person_agents,
        'families' => :list_family_agents,
        'corporate_entities' => :list_corporate_entities_agents
      }
      lister = listers[agent_type]
      raise "Unknown agent type: #{agent_type}" unless lister

      send(lister)
    end
170

171
    # Decides where processing starts. A saved checkpoint is used only when
    # the checkpoint file exists and the caller left +start_from+ at zero;
    # otherwise +start_from+ is returned untouched.
    def resolve_start_position(checkpoint_file, start_from)
      resume = File.exist?(checkpoint_file) && start_from.zero?
      return start_from unless resume

      saved_position = File.read(checkpoint_file).to_i
      Rails.logger.info("Resuming from checkpoint: #{saved_position}")
      saved_position
    end
180

181
    # Processes every id from config[:start_from] onward, one agent at a time,
    # calling the progress/checkpoint hook after each one.
    #
    # config - hash with :output_dir, :start_from, :total_count, :chunk_size
    #          and :checkpoint_file
    #
    # Returns how many agents process_single_agent reported as successful.
    def process_agent_batch(agent_ids, agent_type, config)
      output_dir, start_from, total_count, chunk_size, checkpoint_file =
        config.values_at(:output_dir, :start_from, :total_count, :chunk_size, :checkpoint_file)

      successes = 0
      # with_index(start_from) yields the position in the full id list directly
      agent_ids.drop(start_from).each.with_index(start_from) do |id, absolute_index|
        successes += 1 if process_single_agent(id, agent_type, output_dir, absolute_index)
        handle_progress_and_checkpoints(absolute_index, total_count, agent_type, chunk_size, checkpoint_file)
      end
      successes
    end
199

200
    # Writes the CPF file for one agent and reports whether it succeeded.
    #
    # write_cpf_to_file rescues its own errors (it logs and records them) and
    # signals success by returning the written file name, so the result has
    # to be inspected here; unconditionally returning true counted failed
    # agents as successful. The rescue below only covers errors that escape
    # write_cpf_to_file, and records those in @errors itself.
    #
    # Returns true when a file was written, false otherwise.
    def process_single_agent(id, agent_type, output_dir, absolute_index)
      written = write_cpf_to_file(output_dir, id, agent_type)
      # Only a String (the file name) means success; the failure path yields
      # a non-String value.
      written.is_a?(String)
    rescue => error
      Rails.logger.error("Failed to process #{agent_type} agent #{id} at index #{absolute_index}: #{error.message}")
      @errors << "#{agent_type} agent #{id}: #{error.message}"
      false
    end
208

209
    # Per-agent bookkeeping, based on how many agents are done so far
    # (absolute_index + 1):
    # - logs progress on every 100th agent;
    # - on every chunk_size-th agent writes that count to the checkpoint
    #   file, logs it and pauses for 2 seconds;
    # - otherwise pauses briefly on every 20th agent.
    def handle_progress_and_checkpoints(absolute_index, total_count, agent_type, chunk_size, checkpoint_file)
      done = absolute_index + 1
      Rails.logger.info("Processed #{done}/#{total_count} #{agent_type} agents") if (done % 100).zero?

      # Save checkpoint every chunk_size records
      if (done % chunk_size).zero?
        File.write(checkpoint_file, done)
        Rails.logger.info("Checkpoint saved at #{done}")
        sleep(2)
      elsif (done % 20).zero?
        sleep(0.05) # don't overwhelm the API with requests
      end
    end
221

222
    # Deletes the checkpoint file if it is present; does nothing otherwise.
    def cleanup_checkpoint(checkpoint_file)
      return unless File.exist?(checkpoint_file)

      File.delete(checkpoint_file)
    end
225

226
    # Builds the base file name (no extension) for an agent's CPF export:
    # "<SURNAME>_<FORENAME>_<agent_type>_<id>", leaving out name parts that are
    # not present in the document.
    #
    # doc        - parsed EAC-CPF document responding to at_xpath(xpath, namespaces)
    # id         - agent id
    # agent_type - agent type name, used verbatim
    #
    # Each name part is the text of the first matching eac:part element with
    # all whitespace removed, upcased. Because the result is interpolated into
    # a file path by the caller, "/" and "\" inside a name are replaced with
    # "-" so a name can never add directory components, and parts that end up
    # empty are skipped instead of producing a leading underscore.
    def generate_cpf_filename(doc, id, agent_type)
      namespace = { 'eac' => 'urn:isbn:1-931666-33-4' }

      name_parts = %w[surname forename].map do |local_type|
        text = doc.at_xpath(%(//eac:nameEntry/eac:part[@localType="#{local_type}"]), namespace)&.text
        next if text.nil?

        text.gsub(/\s+/, '').gsub(%r{[/\\]}, '-').upcase
      end
      name_parts = name_parts.compact.reject(&:empty?)

      # Concatenate name parts, agent_type, and agent_id
      filename_parts = []
      filename_parts << name_parts.join('_') unless name_parts.empty?
      filename_parts << agent_type
      filename_parts << id.to_s
      filename_parts.join('_')
    end
245

246
    # Records a non-empty error message in @errors; empty strings are ignored.
    def log_stderr(stderr_str)
      return if stderr_str.empty?

      @errors << stderr_str
    end
249

250
    # Sends a non-empty message to the Rails log at info level; empty strings
    # are ignored.
    def log_stdout(stdout_str)
      return if stdout_str.empty?

      Rails.logger.info(stdout_str)
    end
253
  end
254
  # rubocop:enable Metrics/ClassLength
255
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc