pulibrary / pdc_describe / 9091a1ae-29be-458c-984a-339d213919c4

12 Dec 2024 07:41PM UTC coverage: 26.434% (-69.7%) from 96.113%

Pull Request #2000: Bump actionpack from 7.2.1.1 to 7.2.2.1
Commit: "Removing integration with ActiveStorage" (jrgriffiniii, via circleci)

945 of 3575 relevant lines covered (26.43%)

0.35 hits per line
Source File

/app/services/s3_query_service.rb: 21.88% of lines covered
(In this run, only the class- and method-definition lines were executed; every method body is uncovered.)
# frozen_string_literal: true

require "aws-sdk-s3"

# A service to query an S3 bucket for information about a given data set
# rubocop:disable Metrics/ClassLength
class S3QueryService
  attr_reader :model

  PRECURATION = "precuration"
  POSTCURATION = "postcuration"
  PRESERVATION = "preservation"

  def self.configuration
    Rails.configuration.s3
  end

  def self.pre_curation_config
    configuration.pre_curation
  end

  def self.post_curation_config
    configuration.post_curation
  end

  def self.preservation_config
    configuration.preservation
  end

  attr_reader :part_size, :last_response

  ##
  # @param [Work] model
  # @param [String] mode Valid values are "precuration", "postcuration", "preservation".
  #                      This value controls the AWS S3 bucket used to access the files.
  # @example S3QueryService.new(Work.find(1), "precuration")
  def initialize(model, mode = "precuration")
    @model = model
    @doi = model.doi
    @mode = mode
    @part_size = 5_368_709_120 # 5GB is the maximum part size for AWS
    @last_response = nil
    @s3_responses = {}
  end

  def config
    if @mode == PRESERVATION
      self.class.preservation_config
    elsif @mode == POSTCURATION
      self.class.post_curation_config
    elsif @mode == PRECURATION
      self.class.pre_curation_config
    else
      raise ArgumentError, "Invalid mode value: #{@mode}"
    end
  end
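
  # A minimal sketch of the shape these methods assume for config/s3.yml
  # (all values here are hypothetical placeholders, not from the source):
  #
  #   access_key_id: <%= ENV["AWS_ACCESS_KEY_ID"] %>
  #   secret_access_key: <%= ENV["AWS_SECRET_ACCESS_KEY"] %>
  #   pre_curation:
  #     bucket: "example-precuration-bucket"
  #     region: "us-east-1"
  #   post_curation:
  #     bucket: "example-postcuration-bucket"
  #     region: "us-east-1"
  #   preservation:
  #     bucket: "example-preservation-bucket"
  #     region: "us-east-1"
  #
  # Each mode section must provide at least :bucket and :region (see
  # bucket_name and region below).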

  def pre_curation?
    @mode == PRECURATION
  end

  def post_curation?
    @mode == POSTCURATION
  end

  ##
  # The name of the bucket this class is configured to use.
  # See config/s3.yml for the configuration file.
  def bucket_name
    config.fetch(:bucket, nil)
  end

  def region
    config.fetch(:region, nil)
  end

  ##
  # The S3 prefix for this object, i.e., the address within the S3 bucket,
  # which is based on the DOI
  def prefix
    "#{@doi}/#{model.id}/"
  end

  ##
  # Construct an S3 address for this data set
  def s3_address
    "s3://#{bucket_name}/#{prefix}"
  end

  ##
  # Public signed URL to fetch this file from S3 (valid for a limited time)
  def file_url(key)
    signer = Aws::S3::Presigner.new(client:)
    signer.presigned_url(:get_object, bucket: bucket_name, key:)
  end
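
  # For a hypothetical Work with DOI "10.34770/abc123" and id 42, prefix is
  # "10.34770/abc123/42/" and s3_address is "s3://<bucket>/10.34770/abc123/42/".
  # file_url wraps Aws::S3::Presigner, so the returned HTTPS URL is signed with
  # the configured credentials and expires after the SDK default (15 minutes),
  # since no :expires_in is passed.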

  def access_key_id
    S3QueryService.configuration["access_key_id"]
  end

  def secret_access_key
    S3QueryService.configuration["secret_access_key"]
  end

  def credentials
    @credentials ||= Aws::Credentials.new(access_key_id, secret_access_key)
  end

  def client
    @client ||= Aws::S3::Client.new(region:, credentials:)
  end

  # The object_attributes parameter is required; it accepts ETag, Checksum,
  # ObjectParts, StorageClass, and ObjectSize
  def self.object_attributes
    [
      "ETag",
      "Checksum",
      "ObjectParts",
      "StorageClass",
      "ObjectSize"
    ]
  end

  def get_s3_object_attributes(key:)
    response = client.get_object_attributes({
                                              bucket: bucket_name,
                                              key:,
                                              object_attributes: self.class.object_attributes
                                            })
    response.to_h
  end
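
  # get_object_attributes returns object metadata without the body; the hash
  # from response.to_h includes keys such as :etag, :object_size,
  # :storage_class, and :last_modified, which find_s3_file below maps onto
  # an S3File.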

  def get_s3_object(key:)
    response = client.get_object({
                                   bucket: bucket_name,
                                   key:
                                 })
    object = response.to_h
    return if object.empty?

    object
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting the AWS S3 Object #{key}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  def build_s3_object_key(filename:)
    "#{prefix}#{filename}"
  end

  def find_s3_file(filename:)
    s3_object_key = build_s3_object_key(filename:)

    object = get_s3_object_attributes(key: s3_object_key)
    return if object.nil?

    S3File.new(work: model, filename: s3_object_key, last_modified: object[:last_modified], size: object[:object_size], checksum: object[:etag])
  end

  # Retrieve the S3 resources uploaded to the S3 Bucket
  # @return [Array<S3File>]
  def client_s3_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix)
    if reload # force a reload
      @client_s3_files = nil
      clear_s3_responses(bucket_name:, prefix:)
    end
    @client_s3_files ||= get_s3_objects(bucket_name:, prefix:)
  end

  def client_s3_empty_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix)
    if reload # force a reload
      @client_s3_empty_files = nil
      clear_s3_responses(bucket_name:, prefix:)
    end
    @client_s3_empty_files ||= begin
      files_and_directories = get_s3_objects(bucket_name:, prefix:)
      files_and_directories.select(&:empty?)
    end
  end

  def file_count
    client_s3_files.count
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting AWS S3 Objects from the bucket #{bucket_name} with the prefix #{prefix}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  ##
  # Query the S3 bucket for what we know about the DOI
  # For docs see:
  # * https://docs.aws.amazon.com/sdk-for-ruby/v3/api/Aws/S3/Client.html#list_objects_v2-instance_method
  # * https://docs.aws.amazon.com/sdk-for-ruby/v3/api/Aws/S3/Client.html#get_object_attributes-instance_method
  # @return Hash with two properties {objects: [<S3File>], ok: Bool}
  #   objects is an Array of S3File objects
  #   ok is false if there is an error connecting to S3. Otherwise true.
  def data_profile
    { objects: client_s3_files, ok: true }
  rescue => ex
    Rails.logger.error("Error querying S3. Bucket: #{bucket_name}. DOI: #{@doi}. Exception: #{ex.message}")

    { objects: [], ok: false }
  end

  ##
  # Copies the existing files from the pre-curation bucket to the post-curation bucket.
  # Note that the copy process happens at AWS (i.e., the files are not downloaded and re-uploaded).
  # Returns true once a move job has been enqueued for each file.
  def publish_files(current_user)
    source_bucket = S3QueryService.pre_curation_config[:bucket]
    target_bucket = S3QueryService.post_curation_config[:bucket]
    empty_files = client_s3_empty_files(reload: true, bucket_name: source_bucket)
    # Do not move the empty files; however, ensure that the presence of
    # empty files is noted in the provenance log.
    unless empty_files.empty?
      empty_files.each do |empty_file|
        message = "Warning: Attempted to publish empty S3 file #{empty_file.filename}."
        WorkActivity.add_work_activity(model.id, message, current_user.id, activity_type: WorkActivity::SYSTEM)
      end
    end

    files = client_s3_files(reload: true, bucket_name: source_bucket)
    snapshot = ApprovedUploadSnapshot.new(work: model)
    snapshot.store_files(files, current_user:)
    snapshot.save
    files.each do |file|
      ApprovedFileMoveJob.perform_later(work_id: model.id, source_bucket:, source_key: file.key, target_bucket:,
                                        target_key: file.key, size: file.size, snapshot_id: snapshot.id)
    end
    true
  end
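
  # The copies are server-side: each file is handed to an ApprovedFileMoveJob
  # with matching source and target keys, so the bytes never leave AWS.
  # A hypothetical call site:
  #
  #   S3QueryService.new(work, "precuration").publish_files(current_user)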

  def copy_file(source_key:, target_bucket:, target_key:, size:)
    Rails.logger.info("Copying #{source_key} to #{target_bucket}/#{target_key}")
    if size > part_size
      copy_multi_part(source_key:, target_bucket:, target_key:, size:)
    else
      client.copy_object(copy_source: source_key.gsub("+", "%2B"), bucket: target_bucket, key: target_key, checksum_algorithm: "SHA256")
    end
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to copy AWS S3 Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  def copy_multi_part(source_key:, target_bucket:, target_key:, size:)
    multi = client.create_multipart_upload(bucket: target_bucket, key: target_key, checksum_algorithm: "SHA256")
    part_num = 0
    start_byte = 0
    parts = []
    while start_byte < size
      part_num += 1
      end_byte = [start_byte + part_size, size].min - 1
      resp = client.upload_part_copy(bucket: target_bucket, copy_source: source_key, key: multi.key, part_number: part_num,
                                     upload_id: multi.upload_id, copy_source_range: "bytes=#{start_byte}-#{end_byte}")
      parts << { etag: resp.copy_part_result.etag, part_number: part_num, checksum_sha256: resp.copy_part_result.checksum_sha256 }
      start_byte = end_byte + 1
    end
    client.complete_multipart_upload(bucket: target_bucket, key: target_key, upload_id: multi.upload_id, multipart_upload: { parts: })
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to multipart copy AWS S3 Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end
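
  # Worked example for copy_multi_part: with part_size = 5_368_709_120 bytes
  # (5 GiB) and a 12 GiB object (12_884_901_888 bytes), three ranges are copied:
  #   part 1: bytes=0-5368709119
  #   part 2: bytes=5368709120-10737418239
  #   part 3: bytes=10737418240-12884901887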

  def copy_directory(source_key:, target_bucket:, target_key:)
    client.copy_object(copy_source: source_key, bucket: target_bucket, key: target_key)
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to copy the AWS S3 directory Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  def delete_s3_object(s3_file_key, bucket: bucket_name)
    resp = client.delete_object({ bucket:, key: s3_file_key })
    resp.to_h
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to delete the AWS S3 Object #{s3_file_key} in the bucket #{bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  def create_directory
    client.put_object({ bucket: bucket_name, key: prefix, content_length: 0 })
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to create the AWS S3 directory Object in the bucket #{bucket_name} with the key #{prefix}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  def upload_file(io:, filename:, size:, md5_digest: nil)
    # upload file from io in a single request, may not exceed 5GB
    key = "#{prefix}#{filename}"
    if size > part_size
      upload_multipart_file(target_bucket: bucket_name, target_key: key, size:, io:)
    else
      md5_digest ||= md5(io:)
      @last_response = client.put_object(bucket: bucket_name, key:, body: io, content_md5: md5_digest)
    end
    key
  rescue Aws::S3::Errors::SignatureDoesNotMatch => e
    Honeybadger.notify("Error Uploading file #{filename} for object: #{s3_address} Signature did not match! error: #{e}")
    false
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to create the AWS S3 Object in the bucket #{bucket_name} with the key #{key}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end
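
  # Note the asymmetry above: a SignatureDoesNotMatch error is reported to
  # Honeybadger and upload_file returns false, while any other service error
  # is logged and re-raised, so callers must handle both a false return value
  # and exceptions.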

  def check_file(bucket:, key:)
    client.head_object({ bucket:, key: })
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to check the status of the AWS S3 Object in the bucket #{bucket} with the key #{key}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  def md5(io:)
    md5 = Digest::MD5.new
    io.each(10_000) { |block| md5.update block }
    io.rewind
    md5.base64digest
  end
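
  # The Content-MD5 header that put_object and upload_part validate against
  # must be the base64-encoded (not hex) MD5 digest, hence base64digest; the
  # io is rewound so that subsequent reads start from the beginning.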

  def count_objects(bucket_name: self.bucket_name, prefix: self.prefix)
    responses = s3_responses(bucket_name:, prefix:)
    responses.reduce(0) { |total, resp| total + resp.key_count }
  end
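
  # count_objects sums the key_count of each cached list_objects_v2 page, so
  # it counts keys without building S3File instances, e.g. (hypothetical):
  #
  #   S3QueryService.new(work).count_objects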

  private

    def clear_s3_responses(bucket_name:, prefix:)
      key = "#{bucket_name} #{prefix}"
      @s3_responses[key] = nil
    end

    def s3_responses(bucket_name:, prefix:)
      key = "#{bucket_name} #{prefix}"
      responses = @s3_responses[key]
      if responses.nil?
        resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix: })
        responses = [resp]
        while resp.is_truncated
          resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix:, continuation_token: resp.next_continuation_token })
          responses << resp
        end
        @s3_responses[key] = responses
      end
      responses
    end
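
    # list_objects_v2 returns at most 1,000 keys per response, so s3_responses
    # follows next_continuation_token until is_truncated is false, then caches
    # the full page set under a "bucket prefix" key for reuse.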

    def get_s3_objects(bucket_name:, prefix:)
      start = Time.zone.now
      responses = s3_responses(bucket_name:, prefix:)
      objects = responses.reduce([]) do |all_objects, resp|
        resp_hash = resp.to_h
        resp_objects = parse_objects(resp_hash)
        all_objects + resp_objects
      end
      elapsed = Time.zone.now - start
      Rails.logger.info("Loading S3 objects. Bucket: #{bucket_name}. Prefix: #{prefix}. Elapsed: #{elapsed} seconds")
      objects
    end

    def parse_objects(resp)
      objects = []
      resp_hash = resp.to_h
      response_objects = resp_hash[:contents]
      response_objects&.each do |object|
        s3_file = S3File.new(work: model, filename: object[:key], last_modified: object[:last_modified], size: object[:size], checksum: object[:etag])
        objects << s3_file
      end
      objects
    end

    def upload_multipart_file(target_bucket:, target_key:, size:, io:)
      multi = client.create_multipart_upload(bucket: target_bucket, key: target_key)
      part_num = 0
      start_byte = 0
      parts = []
      while start_byte < size
        part_num += 1
        Tempfile.open("multipart-upload") do |file|
          IO.copy_stream(io, file, part_size)
          file.rewind
          checksum = md5(io: file)
          resp = client.upload_part(body: file, bucket: target_bucket, key: multi.key, part_number: part_num, upload_id: multi.upload_id, content_md5: checksum)
          parts << { etag: resp.etag, part_number: part_num }
        end
        start_byte += part_size
      end
      @last_response = client.complete_multipart_upload(bucket: target_bucket, key: target_key, upload_id: multi.upload_id, multipart_upload: { parts: })
    rescue Aws::Errors::ServiceError => aws_service_error
      message = "An error was encountered when requesting to multipart upload the AWS S3 Object to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
      Rails.logger.error(message)
      raise aws_service_error
    end
end
# rubocop:enable Metrics/ClassLength
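
A minimal usage sketch of the service as a whole, assuming a persisted Work (work) and a User (current_user); bucket names come from config/s3.yml:

  # Query the pre-curation bucket for a work's files
  service = S3QueryService.new(work, "precuration")
  profile = service.data_profile      # => { objects: [S3File, ...], ok: true }
  count = service.file_count          # number of objects under the work's prefix

  # Promote the files to the post-curation bucket
  service.publish_files(current_user) # enqueues an ApprovedFileMoveJob per file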