• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / pdc_describe / e120814a-af4f-469a-9335-a1787b055802

14 Apr 2025 06:57PM UTC coverage: 95.43% (+0.07%) from 95.36%
e120814a-af4f-469a-9335-a1787b055802

Pull #2099

circleci

carolyncole
Add a service for moving embargo files in post-curation
This will move the files from post-curation to the embargo bucket
Pull Request #2099: Add a service for moving embargo files in post-curation

26 of 26 new or added lines in 2 files covered. (100.0%)

13 existing lines in 2 files now uncovered.

3508 of 3676 relevant lines covered (95.43%)

392.09 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.23
/app/services/s3_query_service.rb
1
# frozen_string_literal: true
2

3
require "aws-sdk-s3"
2✔
4

5
# A service to query an S3 bucket for information about a given data set
# rubocop:disable Metrics/ClassLength
class S3QueryService
  attr_reader :model

  # part_size: maximum per-part byte size used when deciding on multipart copies
  # last_response: placeholder for the most recent raw AWS response (set to nil here)
  # s3client: the underlying PULS3Client wrapper that owns the AWS client
  attr_reader :part_size, :last_response, :s3client

  delegate "pre_curation?", "post_curation?", :bucket_name, :region, :client, to: :s3client

  ##
  # @param [Work] model
  # @param [String] mode Valid values are PULS3Client::PRECURATION, PULS3Client::POSTCURATION
  #                          PULS3Client::PRESERVATION, and PULS3Client::EMBARGO.
  #                      This value controls the AWS S3 bucket used to access the files.
  # @param [String, nil] bucket_name optional explicit bucket name overriding the mode's default
  # @example S3QueryService.new(Work.find(1), "precuration")
  def initialize(model, mode = PULS3Client::PRECURATION, bucket_name: nil)
    @model = model
    @doi = model.doi
    @s3client = PULS3Client.new(mode, bucket_name:)
    @part_size = 5_368_709_120 # 5GB is the maximum part size for AWS
    @last_response = nil
    # Cache of list_objects_v2 response pages, keyed by "<bucket> <prefix>"
    @s3_responses = {}
  end

  ##
  # The S3 prefix for this object, i.e., the address within the S3 bucket,
  # which is based on the DOI
  def prefix
    "#{@doi}/#{model.id}/"
  end

  ##
  # Construct an S3 address for this data set
  def s3_address
    "s3://#{bucket_name}/#{prefix}"
  end

  ##
  # Public signed URL to fetch this file from the S3 (valid for a limited time)
  # @param [String] key the S3 object key
  # @return [String] a presigned GET URL
  def file_url(key)
    signer = Aws::S3::Presigner.new(client:)
    signer.presigned_url(:get_object, bucket: bucket_name, key:)
  end

  # required, accepts ETag, Checksum, ObjectParts, StorageClass, ObjectSize
  def self.object_attributes
    [
      "ETag",
      "Checksum",
      "ObjectParts",
      "StorageClass",
      "ObjectSize"
    ]
  end

  # Fetch the attributes of a single S3 object (ETag, checksum, size, etc.)
  # @param [String] key the S3 object key
  # @return [Hash] the AWS GetObjectAttributes response as a Hash
  def get_s3_object_attributes(key:)
    response = client.get_object_attributes({
                                              bucket: bucket_name,
                                              key:,
                                              object_attributes: self.class.object_attributes
                                            })
    response.to_h
  end

  # Fetch a single S3 object
  # @param [String] key the S3 object key
  # @return [Hash, nil] the AWS GetObject response as a Hash, or nil when it is empty
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def get_s3_object(key:)
    response = client.get_object({
                                   bucket: bucket_name,
                                   key:
                                 })
    object = response.to_h
    return if object.empty?

    object
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting the AWS S3 Object #{key}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  # Build the full S3 object key for a file within this work's prefix
  # @param [String] filename
  # @return [String]
  def build_s3_object_key(filename:)
    "#{prefix}#{filename}"
  end

  # Look up a single file in S3 by its filename within this work's prefix
  # @param [String] filename
  # @return [S3File, nil] nil when the object cannot be found
  def find_s3_file(filename:)
    s3_object_key = build_s3_object_key(filename:)

    object = get_s3_object_attributes(key: s3_object_key)
    return if object.nil?

    S3File.new(work: model, filename: s3_object_key, last_modified: object[:last_modified], size: object[:object_size], checksum: object[:etag])
  end

  # Retrieve the S3 resources uploaded to the S3 Bucket
  # @return [Array<S3File>]
  def client_s3_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix)
    if reload # force a reload
      @client_s3_files = nil
      clear_s3_responses(bucket_name:, prefix:)
    end
    @client_s3_files ||= get_s3_objects(bucket_name:, prefix:)
  end

  # Retrieve only the zero-length (empty) S3 resources in the bucket
  # @return [Array<S3File>]
  def client_s3_empty_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix)
    if reload # force a reload
      @client_s3_empty_files = nil
      clear_s3_responses(bucket_name:, prefix:)
    end
    @client_s3_empty_files ||= begin
      files_and_directories = get_s3_objects(bucket_name:, prefix:)
      files_and_directories.select(&:empty?)
    end
  end

  # @return [Integer] the number of S3 files for this work
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def file_count
    client_s3_files.count
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting AWS S3 Objects from the bucket #{bucket_name} with the prefix #{prefix}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  ##
  # Query the S3 bucket for what we know about the doi
  # For docs see:
  # * https://docs.aws.amazon.com/sdk-for-ruby/v3/api/Aws/S3/Client.html#list_objects_v2-instance_method
  # * https://docs.aws.amazon.com/sdk-for-ruby/v3/api/Aws/S3/Client.html#get_object_attributes-instance_method
  # @return Hash with two properties {objects: [<S3File>], ok: Bool}
  #   objects is an Array of S3File objects
  #   ok is false if there is an error connecting to S3. Otherwise true.
  def data_profile
    { objects: client_s3_files, ok: true }
  rescue => ex
    Rails.logger.error("Error querying S3. Bucket: #{bucket_name}. DOI: #{@doi}. Exception: #{ex.message}")

    { objects: [], ok: false }
  end

  ##
  # Copies the existing files from the pre-curation bucket to the post-curation bucket.
  # Notice that the copy process happens at AWS (i.e. the files are not downloaded and re-uploaded).
  # Empty files are not moved; their presence is recorded in the provenance log instead.
  # The copies themselves are performed asynchronously via ApprovedFileMoveJob.
  # @param [User] current_user the user performing the publication
  # @return [true]
  def publish_files(current_user)
    source_bucket = PULS3Client.pre_curation_config[:bucket]
    target_bucket = PULS3Client.post_curation_config[:bucket]
    empty_files = client_s3_empty_files(reload: true, bucket_name: source_bucket)
    # Do not move the empty files, however, ensure that it is noted that the
    #   presence of empty files is specified in the provenance log.
    unless empty_files.empty?
      empty_files.each do |empty_file|
        message = "Warning: Attempted to publish empty S3 file #{empty_file.filename}."
        WorkActivity.add_work_activity(model.id, message, current_user.id, activity_type: WorkActivity::SYSTEM)
      end
    end

    files = client_s3_files(reload: true, bucket_name: source_bucket)
    snapshot = ApprovedUploadSnapshot.new(work: model)
    snapshot.store_files(files, current_user:)
    snapshot.save
    files.each do |file|
      ApprovedFileMoveJob.perform_later(work_id: model.id, source_bucket:, source_key: file.key, target_bucket:,
                                        target_key: file.key, size: file.size, snapshot_id: snapshot.id)
    end
    true
  end

  # Copy a single object within S3, choosing multipart copy when the object
  # exceeds the maximum single-part size (part_size).
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def copy_file(source_key:, target_bucket:, target_key:, size:)
    Rails.logger.info("Copying #{source_key} to #{target_bucket}/#{target_key}")
    if size > part_size
      copy_multi_part(source_key:, target_bucket:, target_key:, size:)
    else
      client.copy_object(copy_source: source_key.gsub("+", "%2B"), bucket: target_bucket, key: target_key, checksum_algorithm: "SHA256")
    end
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to copy AWS S3 Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  # Copy an object larger than part_size using the S3 multipart-upload copy API,
  # copying part_size byte ranges until the full size is covered.
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def copy_multi_part(source_key:, target_bucket:, target_key:, size:)
    multi = client.create_multipart_upload(bucket: target_bucket, key: target_key, checksum_algorithm: "SHA256")
    part_num = 0
    start_byte = 0
    parts = []
    while start_byte < size
      part_num += 1
      end_byte = [start_byte + part_size, size].min - 1
      resp = client.upload_part_copy(bucket: target_bucket, copy_source: source_key, key: multi.key, part_number: part_num,
                                     upload_id: multi.upload_id, copy_source_range: "bytes=#{start_byte}-#{end_byte}")
      parts << { etag: resp.copy_part_result.etag, part_number: part_num, checksum_sha256: resp.copy_part_result.checksum_sha256 }
      start_byte = end_byte + 1
    end
    client.complete_multipart_upload(bucket: target_bucket, key: target_key, upload_id: multi.upload_id, multipart_upload: { parts: })
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to multipart copy AWS S3 Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  # Copy a directory marker object within S3
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def copy_directory(source_key:, target_bucket:, target_key:)
    client.copy_object(copy_source: source_key, bucket: target_bucket, key: target_key)
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to copy the AWS S3 directory Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  # Delete a single S3 object
  # @param [String] s3_file_key the S3 object key
  # @param [String] bucket the bucket to delete from (defaults to this service's bucket)
  # @return [Hash] the AWS DeleteObject response as a Hash
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def delete_s3_object(s3_file_key, bucket: bucket_name)
    resp = client.delete_object({ bucket:, key: s3_file_key })
    resp.to_h
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to delete the AWS S3 Object #{s3_file_key} in the bucket #{bucket_name}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  # Create a zero-length "directory" marker object at this work's prefix
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def create_directory
    client.put_object({ bucket: bucket_name, key: prefix, content_length: 0 })
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to create the AWS S3 directory Object in the bucket #{bucket_name} with the key #{prefix}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  # Upload a file IO to S3 under this work's prefix
  # @param [IO] io the stream to upload
  # @param [String] filename the file's name, appended to the prefix to form the key
  # @param [Integer] size the byte size of the upload
  # @param [String, nil] md5_digest optional precomputed MD5 digest
  # @return [String, nil] the S3 key when the upload succeeds, otherwise nil
  def upload_file(io:, filename:, size:, md5_digest: nil)
    key = build_s3_object_key(filename:)
    if s3client.upload_file(io:, target_key: key, size:, md5_digest:)
      key
    end
  end

  # Check the status of an S3 object via HeadObject
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def check_file(bucket:, key:)
    client.head_object({ bucket:, key: })
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to check the status of the AWS S3 Object in the bucket #{bucket} with the key #{key}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end

  private

    # Drop the cached list_objects_v2 response pages for a bucket/prefix pair
    def clear_s3_responses(bucket_name:, prefix:)
      key = "#{bucket_name} #{prefix}"
      @s3_responses[key] = nil
    end

    # Fetch (and memoize) every list_objects_v2 response page for a bucket/prefix,
    # following continuation tokens so all objects are captured
    def s3_responses(bucket_name:, prefix:)
      key = "#{bucket_name} #{prefix}"
      responses = @s3_responses[key]
      if responses.nil?
        resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix: })
        responses = [resp]
        while resp.is_truncated
          resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix:, continuation_token: resp.next_continuation_token })
          responses << resp
        end
        @s3_responses[key] = responses
      end
      responses
    end

    # Flatten all cached response pages into S3File objects, logging the elapsed time
    # @return [Array<S3File>]
    def get_s3_objects(bucket_name:, prefix:)
      start = Time.zone.now
      responses = s3_responses(bucket_name:, prefix:)
      objects = responses.reduce([]) do |all_objects, resp|
        resp_hash = resp.to_h
        resp_objects = parse_objects(resp_hash)
        all_objects + resp_objects
      end
      elapsed = Time.zone.now - start
      Rails.logger.info("Loading S3 objects. Bucket: #{bucket_name}. Prefix: #{prefix}. Elapsed: #{elapsed} seconds")
      objects
    end

    # Convert one list_objects_v2 response into S3File objects
    # @param [Hash] resp a response already converted (or convertible) to a Hash
    # @return [Array<S3File>]
    def parse_objects(resp)
      objects = []
      resp_hash = resp.to_h
      response_objects = resp_hash[:contents]
      response_objects&.each do |object|
        s3_file = S3File.new(work: model, filename: object[:key], last_modified: object[:last_modified], size: object[:size], checksum: object[:etag])
        objects << s3_file
      end
      objects
    end
end
# rubocop:enable Metrics/ClassLength
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc