• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / pdc_describe / 6e0486a6-b0d3-473d-bdb0-ec772992e9da

10 Apr 2025 07:46PM UTC coverage: 95.367% (-0.03%) from 95.399%
6e0486a6-b0d3-473d-bdb0-ec772992e9da

Pull #2094

circleci

hectorcorrea
Fixed test

Co-authored-by: Robert-Anthony Lee-Faison <leefaisonr@users.noreply.github.com>
Pull Request #2094: Move files to embargo bucket for approved embargoed works

22 of 24 new or added lines in 3 files covered. (91.67%)

28 existing lines in 3 files now uncovered.

3479 of 3648 relevant lines covered (95.37%)

398.81 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.67
/app/services/s3_query_service.rb
1
# frozen_string_literal: true

require "aws-sdk-s3"

# A service to query an S3 bucket for information about a given data set
# rubocop:disable Metrics/ClassLength
class S3QueryService
  attr_reader :model

  attr_reader :part_size, :last_response, :s3client

  # Bucket selection (pre/post-curation), bucket name, region, and the raw
  # Aws::S3::Client all live on the underlying PULS3Client.
  delegate "pre_curation?", "post_curation?", :bucket_name, :region, :client, to: :s3client

  ##
  # @param [Work] model
  # @param [String] mode Valid values are PULS3Client::PRECURATION, PULS3Client::POSTCURATION
  #                          PULS3Client::PRESERVATION, and PULS3Client::EMBARGO.
  #                      This value controls the AWS S3 bucket used to access the files.
  # @example S3QueryService.new(Work.find(1), "precuration")
  def initialize(model, mode = PULS3Client::PRECURATION)
    @model = model
    @doi = model.doi
    @s3client = PULS3Client.new(mode)
    @part_size = 5_368_709_120 # 5GB is the maximum part size for AWS
    @last_response = nil
    # Cache of list_objects_v2 response pages, keyed by "#{bucket} #{prefix}".
    @s3_responses = {}
  end

  ##
  # The S3 prefix for this object, i.e., the address within the S3 bucket,
  # which is based on the DOI
  def prefix
    "#{@doi}/#{model.id}/"
  end

  ##
  # Construct an S3 address for this data set
  def s3_address
    "s3://#{bucket_name}/#{prefix}"
  end

  ##
  # Public signed URL to fetch this file from the S3 (valid for a limited time)
  def file_url(key)
    signer = Aws::S3::Presigner.new(client:)
    signer.presigned_url(:get_object, bucket: bucket_name, key:)
  end

  # required, accepts ETag, Checksum, ObjectParts, StorageClass, ObjectSize
  def self.object_attributes
    [
      "ETag",
      "Checksum",
      "ObjectParts",
      "StorageClass",
      "ObjectSize"
    ]
  end

  ##
  # Fetch the S3 attributes (ETag, checksum, size, etc.) for a single object.
  # @param [String] key full S3 object key
  # @return [Hash] the GetObjectAttributes response as a hash
  def get_s3_object_attributes(key:)
    response = client.get_object_attributes({
                                              bucket: bucket_name,
                                              key:,
                                              object_attributes: self.class.object_attributes
                                            })
    response.to_h
  end

  ##
  # Fetch a single S3 object (including its body) from the current bucket.
  # @param [String] key full S3 object key
  # @return [Hash, nil] the GetObject response as a hash, or nil if the response was empty
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def get_s3_object(key:)
    response = client.get_object({
                                   bucket: bucket_name,
                                   key:
                                 })
    object = response.to_h
    return if object.empty?

    object
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting the AWS S3 Object #{key}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  ##
  # Build the full S3 object key for a file belonging to this work.
  # NOTE: the interpolation was mangled in the coverage export; reconstructed
  # from the filename: parameter this method receives.
  # @param [String] filename
  # @return [String] "#{doi}/#{work_id}/#{filename}"
  def build_s3_object_key(filename:)
    "#{prefix}#{filename}"
  end

  ##
  # Locate a single file in the S3 bucket by filename and wrap it in an S3File.
  # @param [String] filename
  # @return [S3File, nil]
  def find_s3_file(filename:)
    s3_object_key = build_s3_object_key(filename:)

    object = get_s3_object_attributes(key: s3_object_key)
    return if object.nil?

    S3File.new(work: model, filename: s3_object_key, last_modified: object[:last_modified], size: object[:object_size], checksum: object[:etag])
  end

  # Retrieve the S3 resources uploaded to the S3 Bucket
  # Results are memoized; pass reload: true to refresh from S3.
  # @return [Array<S3File>]
  def client_s3_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix)
    if reload # force a reload
      @client_s3_files = nil
      clear_s3_responses(bucket_name:, prefix:)
    end
    @client_s3_files ||= get_s3_objects(bucket_name:, prefix:)
  end

  ##
  # Retrieve only the zero-byte S3 resources (e.g. directory placeholder objects).
  # Results are memoized; pass reload: true to refresh from S3.
  # @return [Array<S3File>]
  def client_s3_empty_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix)
    if reload # force a reload
      @client_s3_empty_files = nil
      clear_s3_responses(bucket_name:, prefix:)
    end
    @client_s3_empty_files ||= begin
      files_and_directories = get_s3_objects(bucket_name:, prefix:)
      files_and_directories.select(&:empty?)
    end
  end

  ##
  # Number of files currently known for this work in the bucket.
  # @return [Integer]
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def file_count
    client_s3_files.count
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting AWS S3 Objects from the bucket #{bucket_name} with the prefix #{prefix}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  ##
  # Query the S3 bucket for what we know about the doi
  # For docs see:
  # * https://docs.aws.amazon.com/sdk-for-ruby/v3/api/Aws/S3/Client.html#list_objects_v2-instance_method
  # * https://docs.aws.amazon.com/sdk-for-ruby/v3/api/Aws/S3/Client.html#get_object_attributes-instance_method
  # @return Hash with two properties {objects: [<S3File>], ok: Bool}
  #   objects is an Array of S3File objects
  #   ok is false if there is an error connecting to S3. Otherwise true.
  def data_profile
    { objects: client_s3_files, ok: true }
  rescue => ex
    Rails.logger.error("Error querying S3. Bucket: #{bucket_name}. DOI: #{@doi}. Exception: #{ex.message}")

    { objects: [], ok: false }
  end

  ##
  # Copies the existing files from the pre-curation bucket to the target bucket (postcuration or embargo).
  # Notice that the copy process happens at AWS (i.e. the files are not downloaded and re-uploaded).
  # Empty files are not moved; their presence is recorded in the provenance log instead.
  # The actual copy is performed asynchronously by ApprovedFileMoveJob.
  # @param [User] current_user the user responsible for the publish, recorded in activity/snapshot
  # @return [true]
  def publish_files(current_user)
    source_bucket = PULS3Client.pre_curation_config[:bucket]
    target_bucket = if model.embargoed?
                      PULS3Client.embargo_config[:bucket]
                    else
                      PULS3Client.post_curation_config[:bucket]
                    end

    empty_files = client_s3_empty_files(reload: true, bucket_name: source_bucket)
    # Do not move the empty files, however, ensure that it is noted that the
    #   presence of empty files is specified in the provenance log.
    unless empty_files.empty?
      empty_files.each do |empty_file|
        message = "Warning: Attempted to publish empty S3 file #{empty_file.filename}."
        WorkActivity.add_work_activity(model.id, message, current_user.id, activity_type: WorkActivity::SYSTEM)
      end
    end

    files = client_s3_files(reload: true, bucket_name: source_bucket)
    snapshot = ApprovedUploadSnapshot.new(work: model)
    snapshot.store_files(files, current_user:)
    snapshot.save
    files.each do |file|
      ApprovedFileMoveJob.perform_later(work_id: model.id, source_bucket:, source_key: file.key, target_bucket:,
                                        target_key: file.key, size: file.size, snapshot_id: snapshot.id)
    end
    true
  end

  ##
  # Server-side copy of a single object, switching to multipart copy when the
  # object exceeds the 5GB single-request limit (part_size).
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def copy_file(source_key:, target_bucket:, target_key:, size:)
    Rails.logger.info("Copying #{source_key} to #{target_bucket}/#{target_key}")
    if size > part_size
      copy_multi_part(source_key:, target_bucket:, target_key:, size:)
    else
      # "+" must be percent-encoded in the copy_source or S3 treats it as a space
      client.copy_object(copy_source: source_key.gsub("+", "%2B"), bucket: target_bucket, key: target_key, checksum_algorithm: "SHA256")
    end
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to copy AWS S3 Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  ##
  # Multipart server-side copy for objects larger than part_size (5GB),
  # copying byte ranges with upload_part_copy and completing the upload.
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def copy_multi_part(source_key:, target_bucket:, target_key:, size:)
    multi = client.create_multipart_upload(bucket: target_bucket, key: target_key, checksum_algorithm: "SHA256")
    part_num = 0
    start_byte = 0
    parts = []
    while start_byte < size
      part_num += 1
      end_byte = [start_byte + part_size, size].min - 1 # byte ranges are inclusive
      resp = client.upload_part_copy(bucket: target_bucket, copy_source: source_key, key: multi.key, part_number: part_num,
                                     upload_id: multi.upload_id, copy_source_range: "bytes=#{start_byte}-#{end_byte}")
      parts << { etag: resp.copy_part_result.etag, part_number: part_num, checksum_sha256: resp.copy_part_result.checksum_sha256 }
      start_byte = end_byte + 1
    end
    client.complete_multipart_upload(bucket: target_bucket, key: target_key, upload_id: multi.upload_id, multipart_upload: { parts: })
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to multipart copy AWS S3 Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  ##
  # Server-side copy of a directory placeholder object.
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def copy_directory(source_key:, target_bucket:, target_key:)
    client.copy_object(copy_source: source_key, bucket: target_bucket, key: target_key)
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to copy the AWS S3 directory Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  ##
  # Delete a single object from the given bucket (defaults to this service's bucket).
  # @return [Hash] the DeleteObject response as a hash
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def delete_s3_object(s3_file_key, bucket: bucket_name)
    resp = client.delete_object({ bucket:, key: s3_file_key })
    resp.to_h
  rescue Aws::Errors::ServiceError => aws_service_error
    # Log the bucket actually targeted by the delete, which may differ from bucket_name.
    message = "An error was encountered when requesting to delete the AWS S3 Object #{s3_file_key} in the bucket #{bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  ##
  # Create the zero-byte directory placeholder object for this work's prefix.
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def create_directory
    client.put_object({ bucket: bucket_name, key: prefix, content_length: 0 })
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to create the AWS S3 directory Object in the bucket #{bucket_name} with the key #{prefix}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  ##
  # Upload an IO stream to this work's prefix via the underlying PULS3Client.
  # NOTE: the key interpolation was mangled in the coverage export; reconstructed
  # from the filename: parameter, mirroring build_s3_object_key.
  # @param [IO] io the content to upload
  # @param [String] filename
  # @param [Integer] size
  # @param [String, nil] md5_digest
  # @return [String, nil] the uploaded key, or nil if the upload failed
  def upload_file(io:, filename:, size:, md5_digest: nil)
    key = "#{prefix}#{filename}"
    if s3client.upload_file(io:, target_key: key, size:, md5_digest:)
      key
    end
  end

  ##
  # HEAD an object to check that it exists and is accessible.
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def check_file(bucket:, key:)
    client.head_object({ bucket:, key: })
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to check the status of the AWS S3 Object in the bucket #{bucket} with the key #{key}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  private

    # Drop the cached list_objects_v2 pages for a bucket/prefix pair.
    def clear_s3_responses(bucket_name:, prefix:)
      key = "#{bucket_name} #{prefix}"
      @s3_responses[key] = nil
    end

    # Fetch (and cache) every page of list_objects_v2 results for a bucket/prefix,
    # following continuation tokens until the listing is no longer truncated.
    # @return [Array<Aws::S3::Types::ListObjectsV2Output>]
    def s3_responses(bucket_name:, prefix:)
      key = "#{bucket_name} #{prefix}"
      responses = @s3_responses[key]
      if responses.nil?
        resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix: })
        responses = [resp]
        while resp.is_truncated
          resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix:, continuation_token: resp.next_continuation_token })
          responses << resp
        end
        @s3_responses[key] = responses
      end
      responses
    end

    # Flatten all cached response pages into S3File objects, logging the elapsed time.
    # @return [Array<S3File>]
    def get_s3_objects(bucket_name:, prefix:)
      start = Time.zone.now
      responses = s3_responses(bucket_name:, prefix:)
      objects = responses.reduce([]) do |all_objects, resp|
        resp_hash = resp.to_h
        resp_objects = parse_objects(resp_hash)
        all_objects + resp_objects
      end
      elapsed = Time.zone.now - start
      Rails.logger.info("Loading S3 objects. Bucket: #{bucket_name}. Prefix: #{prefix}. Elapsed: #{elapsed} seconds")
      objects
    end

    # Convert one list_objects_v2 response (hash) into S3File objects,
    # one per entry in its :contents array (which may be absent).
    # @return [Array<S3File>]
    def parse_objects(resp)
      objects = []
      resp_hash = resp.to_h
      response_objects = resp_hash[:contents]
      response_objects&.each do |object|
        s3_file = S3File.new(work: model, filename: object[:key], last_modified: object[:last_modified], size: object[:size], checksum: object[:etag])
        objects << s3_file
      end
      objects
    end
end
# rubocop:enable Metrics/ClassLength
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc