pulibrary / pdc_describe, commit 7ae96b6b-0a64-4479-9fa1-8ba6526e2c87
Pull Request #1701: "Update language on submission form" (leefaisonr, via CircleCI; makes it so that links open in new window)
20 Mar 2024 12:42PM UTC. Coverage: 30.068% (-66.2% from 96.266%); 1019 of 3389 relevant lines covered (30.07%), 0.4 hits per line.

Source file: /app/services/s3_query_service.rb (22.81% covered)

# frozen_string_literal: true

require "aws-sdk-s3"

# A service to query an S3 bucket for information about a given data set
# rubocop:disable Metrics/ClassLength
class S3QueryService
  attr_reader :model

  PRECURATION = "precuration"
  POSTCURATION = "postcuration"
  PRESERVATION = "preservation"

  def self.configuration
    Rails.configuration.s3
  end

  def self.pre_curation_config
    configuration.pre_curation
  end

  def self.post_curation_config
    configuration.post_curation
  end

  def self.preservation_config
    configuration.preservation
  end

  attr_reader :part_size, :last_response

  ##
  # @param [Work] model
  # @param [String] mode Valid values are "precuration", "postcuration", "preservation".
  #                      This value controls the AWS S3 bucket used to access the files.
  # @example S3QueryService.new(Work.find(1), "precuration")
  def initialize(model, mode = "precuration")
    @model = model
    @doi = model.doi
    @mode = mode
    @part_size = 5_368_709_120 # 5GB is the maximum part size for AWS
    @last_response = nil
    @s3_responses = {}
  end

  def config
    if @mode == PRESERVATION
      self.class.preservation_config
    elsif @mode == POSTCURATION
      self.class.post_curation_config
    elsif @mode == PRECURATION
      self.class.pre_curation_config
    else
      raise ArgumentError, "Invalid mode value: #{@mode}"
    end
  end
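
  # A usage sketch (the bucket name shown is hypothetical; real values live in
  # config/s3.yml): the mode selects which bucket configuration is used.
  #   S3QueryService.new(work, "precuration").bucket_name # => e.g. "pdc-precuration"
  #   S3QueryService.new(work, "badmode").config          # raises ArgumentError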

  def pre_curation?
    @mode == PRECURATION
  end

  def post_curation?
    @mode == POSTCURATION
  end

  ##
  # The name of the bucket this class is configured to use.
  # See config/s3.yml for the configuration file.
  def bucket_name
    config.fetch(:bucket, nil)
  end

  def region
    config.fetch(:region, nil)
  end

  ##
  # The S3 prefix for this object, i.e., the address within the S3 bucket,
  # which is based on the DOI
  def prefix
    "#{@doi}/#{model.id}/"
  end

  ##
  # Construct an S3 address for this data set
  def s3_address
    "s3://#{bucket_name}/#{prefix}"
  end
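
  # For illustration, with a hypothetical DOI "10.34770/abc123" and model id 42:
  #   prefix     # => "10.34770/abc123/42/"
  #   s3_address # => "s3://<bucket>/10.34770/abc123/42/"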

  ##
  # Public signed URL to fetch this file from S3 (valid for a limited time)
  def file_url(key)
    signer = Aws::S3::Presigner.new(client:)
    signer.presigned_url(:get_object, bucket: bucket_name, key:)
  end
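
  # The presigner returns a time-limited HTTPS URL (the AWS SDK default
  # expiration is 15 minutes), along these lines (key is hypothetical):
  #   file_url("10.34770/abc123/42/data.csv")
  #   # => "https://<bucket>.s3.<region>.amazonaws.com/10.34770/abc123/42/data.csv?X-Amz-Signature=..."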

  # There is probably a better way to fetch the current ActiveStorage
  # configuration but we have not found it.
  def active_storage_configuration
    Rails.configuration.active_storage.service_configurations[Rails.configuration.active_storage.service.to_s]
  end

  def access_key_id
    active_storage_configuration["access_key_id"]
  end

  def secret_access_key
    active_storage_configuration["secret_access_key"]
  end

  def credentials
    @credentials ||= Aws::Credentials.new(access_key_id, secret_access_key)
  end

  def client
    @client ||= Aws::S3::Client.new(region:, credentials:)
  end

  # required, accepts ETag, Checksum, ObjectParts, StorageClass, ObjectSize
  def self.object_attributes
    [
      "ETag",
      "Checksum",
      "ObjectParts",
      "StorageClass",
      "ObjectSize"
    ]
  end

  def get_s3_object_attributes(key:)
    response = client.get_object_attributes({
                                              bucket: bucket_name,
                                              key:,
                                              object_attributes: self.class.object_attributes
                                            })
    response.to_h
  end

  def get_s3_object(key:)
    response = client.get_object({
                                   bucket: bucket_name,
                                   key:
                                 })
    object = response.to_h
    return if object.empty?

    object
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting the AWS S3 Object #{key}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  def build_s3_object_key(filename:)
    "#{prefix}#{filename}"
  end

  def find_s3_file(filename:)
    s3_object_key = build_s3_object_key(filename:)

    object = get_s3_object_attributes(key: s3_object_key)
    return if object.nil?

    S3File.new(work: model, filename: s3_object_key, last_modified: object[:last_modified], size: object[:object_size], checksum: object[:etag])
  end
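
  # Illustrative lookup (filename is hypothetical); builds an S3File from the
  # object's attributes, or returns nil when no attributes come back:
  #   find_s3_file(filename: "data.csv")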

  # Retrieve the S3 resources uploaded to the S3 Bucket
  # @return [Array<S3File>]
  def client_s3_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix, ignore_directories: true)
    if reload # force a reload
      @client_s3_files = nil
      clear_s3_responses(bucket_name:, prefix:)
    end
    @client_s3_files ||= get_s3_objects(bucket_name:, prefix:, ignore_directories:)
  end

  def client_s3_empty_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix)
    if reload # force a reload
      @client_s3_empty_files = nil
      clear_s3_responses(bucket_name:, prefix:)
    end
    @client_s3_empty_files ||= begin
      files_and_directories = get_s3_objects(bucket_name:, prefix:, ignore_directories: false)
      files_and_directories.select { |object| !object.filename.ends_with?("/") && object.empty? }
    end
  end

  def file_count
    client_s3_files.count
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting AWS S3 Objects from the bucket #{bucket_name} with the prefix #{prefix}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  ##
  # Query the S3 bucket for what we know about the DOI.
  # For docs see:
  # * https://docs.aws.amazon.com/sdk-for-ruby/v3/api/Aws/S3/Client.html#list_objects_v2-instance_method
  # * https://docs.aws.amazon.com/sdk-for-ruby/v3/api/Aws/S3/Client.html#get_object_attributes-instance_method
  # @return Hash with two properties {objects: [<S3File>], ok: Bool}
  #   objects is an Array of S3File objects
  #   ok is false if there is an error connecting to S3. Otherwise true.
  def data_profile
    { objects: client_s3_files, ok: true }
  rescue => ex
    Rails.logger.error("Error querying S3. Bucket: #{bucket_name}. DOI: #{@doi}. Exception: #{ex.message}")

    { objects: [], ok: false }
  end
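
  # The two possible shapes of the return value, for reference:
  #   data_profile # => { objects: [#<S3File ...>, ...], ok: true }
  #   data_profile # => { objects: [], ok: false } (when the S3 query raises)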

  ##
  # Copies the existing files from the pre-curation bucket to the post-curation bucket.
  # Note that the copy happens entirely within AWS (i.e. the files are not downloaded and re-uploaded).
  # Returns true once the copy jobs have been enqueued.
  def publish_files(current_user)
    source_bucket = S3QueryService.pre_curation_config[:bucket]
    target_bucket = S3QueryService.post_curation_config[:bucket]
    empty_files = client_s3_empty_files(reload: true, bucket_name: source_bucket)
    # Do not move the empty files; do, however, record their presence in the
    # provenance log.
    unless empty_files.empty?
      empty_files.each do |empty_file|
        message = "Warning: Attempted to publish empty S3 file #{empty_file.filename}."
        WorkActivity.add_work_activity(model.id, message, current_user.id, activity_type: WorkActivity::SYSTEM)
      end
    end

    files = client_s3_files(reload: true, bucket_name: source_bucket)
    snapshot = ApprovedUploadSnapshot.new(work: model)
    snapshot.store_files(files, current_user:)
    snapshot.save
    files.each do |file|
      ApprovedFileMoveJob.perform_later(work_id: model.id, source_bucket:, source_key: file.key, target_bucket:,
                                        target_key: file.key, size: file.size, snapshot_id: snapshot.id)
    end
    true
  end
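
  # Note that publish_files only enqueues the copies; the actual byte transfer
  # happens asynchronously in ApprovedFileMoveJob, which presumably calls
  # copy_file below for each enqueued file.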

  def copy_file(source_key:, target_bucket:, target_key:, size:)
    Rails.logger.info("Copying #{source_key} to #{target_bucket}/#{target_key}")
    if size > part_size
      copy_multi_part(source_key:, target_bucket:, target_key:, size:)
    else
      client.copy_object(copy_source: source_key, bucket: target_bucket, key: target_key, checksum_algorithm: "SHA256")
    end
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to copy AWS S3 Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  def copy_multi_part(source_key:, target_bucket:, target_key:, size:)
    multi = client.create_multipart_upload(bucket: target_bucket, key: target_key, checksum_algorithm: "SHA256")
    part_num = 0
    start_byte = 0
    parts = []
    while start_byte < size
      part_num += 1
      end_byte = [start_byte + part_size, size].min - 1
      resp = client.upload_part_copy(bucket: target_bucket, copy_source: source_key, key: multi.key, part_number: part_num,
                                     upload_id: multi.upload_id, copy_source_range: "bytes=#{start_byte}-#{end_byte}")
      parts << { etag: resp.copy_part_result.etag, part_number: part_num, checksum_sha256: resp.copy_part_result.checksum_sha256 }
      start_byte = end_byte + 1
    end
    client.complete_multipart_upload(bucket: target_bucket, key: target_key, upload_id: multi.upload_id, multipart_upload: { parts: })
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to multipart copy AWS S3 Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end
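
  # Worked example of the ranges above: with part_size = 5_368_709_120 bytes and
  # a hypothetical object of 12_000_000_000 bytes, the loop copies three parts:
  #   part 1: bytes=0-5368709119
  #   part 2: bytes=5368709120-10737418239
  #   part 3: bytes=10737418240-11999999999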

  def copy_directory(source_key:, target_bucket:, target_key:)
    client.copy_object(copy_source: source_key, bucket: target_bucket, key: target_key)
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to copy the AWS S3 directory Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  def delete_s3_object(s3_file_key, bucket: bucket_name)
    resp = client.delete_object({ bucket:, key: s3_file_key })
    resp.to_h
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to delete the AWS S3 Object #{s3_file_key} in the bucket #{bucket_name}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  def create_directory
    client.put_object({ bucket: bucket_name, key: prefix, content_length: 0 })
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to create the AWS S3 directory Object in the bucket #{bucket_name} with the key #{prefix}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  def upload_file(io:, filename:, size:, md5_digest: nil)
    # Upload the file from io in a single request; a single PUT may not exceed 5GB.
    key = "#{prefix}#{filename}"
    if size > part_size
      upload_multipart_file(target_bucket: bucket_name, target_key: key, size:, io:)
    else
      md5_digest ||= md5(io:)
      @last_response = client.put_object(bucket: bucket_name, key:, body: io, content_md5: md5_digest)
    end
    key
  rescue Aws::S3::Errors::SignatureDoesNotMatch => e
    Honeybadger.notify("Error Uploading file #{filename} for object: #{s3_address} Signature did not match! error: #{e}")
    false
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to create the AWS S3 Object in the bucket #{bucket_name} with the key #{key}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end
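
  # Illustrative call (filename and size are hypothetical); returns the S3 key
  # on success, or false when the request signature is rejected:
  #   upload_file(io: File.open("data.csv"), filename: "data.csv", size: 1024)
  #   # => "10.34770/abc123/42/data.csv"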

  def check_file(bucket:, key:)
    client.head_object({ bucket:, key: })
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to check the status of the AWS S3 Object in the bucket #{bucket} with the key #{key}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  def md5(io:)
    md5 = Digest::MD5.new
    io.each(10_000) { |block| md5.update block }
    io.rewind
    md5.base64digest
  end
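
  # The digest is base64-encoded rather than hex because the S3 content_md5
  # header expects base64. For example, for an empty IO:
  #   md5(io: StringIO.new("")) # => "1B2M2Y8AsgTpgAmY7PhCfg=="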

  def count_objects(bucket_name: self.bucket_name, prefix: self.prefix)
    responses = s3_responses(bucket_name:, prefix:)
    total_key_count = responses.reduce(0) { |total, resp| total + resp.key_count }
    total_key_count - 1 # S3 always sends back the bucket key as the first response, so we should not count it
  end

  private

    def clear_s3_responses(bucket_name:, prefix:)
      key = "#{bucket_name} #{prefix}"
      @s3_responses[key] = nil
    end

    def s3_responses(bucket_name:, prefix:)
      key = "#{bucket_name} #{prefix}"
      responses = @s3_responses[key]
      if responses.nil?
        resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix: })
        responses = [resp]
        while resp.is_truncated
          resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix:, continuation_token: resp.next_continuation_token })
          responses << resp
        end
        @s3_responses[key] = responses
      end
      responses
    end
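
    # list_objects_v2 returns at most 1,000 keys per response, so the loop above
    # follows next_continuation_token until is_truncated is false and caches the
    # accumulated pages under a "<bucket_name> <prefix>" key.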

    def get_s3_objects(bucket_name:, prefix:, ignore_directories:)
      start = Time.zone.now
      responses = s3_responses(bucket_name:, prefix:)
      objects = responses.reduce([]) do |all_objects, resp|
        resp_hash = resp.to_h
        resp_objects = parse_objects(resp_hash, ignore_directories:)
        all_objects + resp_objects
      end
      elapsed = Time.zone.now - start
      Rails.logger.info("Loading S3 objects. Bucket: #{bucket_name}. Prefix: #{prefix}. Elapsed: #{elapsed} seconds")
      objects
    end

    def parse_objects(resp, ignore_directories: true)
      objects = []
      resp_hash = resp.to_h
      response_objects = resp_hash[:contents]
      response_objects&.each do |object|
        next if object[:size] == 0 && ignore_directories
        s3_file = S3File.new(work: model, filename: object[:key], last_modified: object[:last_modified], size: object[:size], checksum: object[:etag])
        objects << s3_file
      end
      objects
    end

    def upload_multipart_file(target_bucket:, target_key:, size:, io:)
      multi = client.create_multipart_upload(bucket: target_bucket, key: target_key)
      part_num = 0
      start_byte = 0
      parts = []
      while start_byte < size
        part_num += 1
        Tempfile.open("multipart-upload") do |file|
          IO.copy_stream(io, file, part_size)
          file.rewind
          checksum = md5(io: file)
          resp = client.upload_part(body: file, bucket: target_bucket, key: multi.key, part_number: part_num, upload_id: multi.upload_id, content_md5: checksum)
          parts << { etag: resp.etag, part_number: part_num }
        end
        start_byte += part_size
      end
      @last_response = client.complete_multipart_upload(bucket: target_bucket, key: target_key, upload_id: multi.upload_id, multipart_upload: { parts: })
    rescue Aws::Errors::ServiceError => aws_service_error
      message = "An error was encountered when requesting to multipart upload the AWS S3 Object #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
      Rails.logger.error(message)
      raise aws_service_error
    end
end
# rubocop:enable Metrics/ClassLength