• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / pdc_describe / e9529ebc-53b1-4951-a698-9e2a3bf71118

15 Apr 2025 02:35PM UTC coverage: 95.343% (-0.08%) from 95.426%
e9529ebc-53b1-4951-a698-9e2a3bf71118

Pull #2094

circleci

web-flow
Merge branch 'main' into 2055-move-files-to-embargo-bucket
Pull Request #2094: Move files to embargo bucket for approved embargoed works

24 of 28 new or added lines in 4 files covered. (85.71%)

3 existing lines in 1 file now uncovered.

3521 of 3693 relevant lines covered (95.34%)

389.51 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.37
/app/services/s3_query_service.rb
1
# frozen_string_literal: true
2

3
require "aws-sdk-s3"
2✔
4

5
# A service to query an S3 bucket for information about a given data set
6
# rubocop:disable Metrics/ClassLength
7
class S3QueryService
2✔
8
  attr_reader :model
2✔
9

10
  attr_reader :part_size, :last_response, :s3client
2✔
11

12
  delegate "pre_curation?", "post_curation?", :bucket_name, :region, :client, to: :s3client
2✔
13

14
  ##
  # @param [Work] model
  # @param [String] mode Valid values are PULS3Client::PRECURATION, PULS3Client::POSTCURATION
  #                          PULS3Client::PRESERVATION, and PULS3Client::EMBARGO.
  #                      This value controls the AWS S3 bucket used to access the files.
  # @example S3QueryService.new(Work.find(1), "precuration")
  def initialize(model, mode = PULS3Client::PRECURATION, bucket_name: nil)
    @s3client = PULS3Client.new(mode, bucket_name:)
    @model = model
    @doi = model.doi
    # 5 GB — the maximum size AWS accepts for a single (multipart) part.
    @part_size = 5_368_709_120
    @last_response = nil
    # Cache of list_objects_v2 response pages, keyed by "bucket prefix".
    @s3_responses = {}
  end
28

29
  ##
  # The S3 prefix for this object, i.e., the address within the S3 bucket,
  # which is based on the DOI
  def prefix
    # "<doi>/<work id>/" — the trailing empty segment yields the final slash.
    [@doi, model.id, ""].join("/")
  end
35

36
  ##
  # Public signed URL to fetch this file from the S3 (valid for a limited time)
  def file_url(key)
    Aws::S3::Presigner.new(client:).presigned_url(:get_object, bucket: bucket_name, key:)
  end
42

43
  # required, accepts ETag, Checksum, ObjectParts, StorageClass, ObjectSize
  def self.object_attributes
    %w[ETag Checksum ObjectParts StorageClass ObjectSize]
  end
53

54
  # Fetch the attributes AWS tracks for a single S3 object under this service's bucket.
  # @param key [String] the full S3 key of the object
  # @return [Hash] the get_object_attributes response converted to a Hash
  def get_s3_object_attributes(key:)
    request = {
      bucket: bucket_name,
      key:,
      object_attributes: self.class.object_attributes
    }
    client.get_object_attributes(request).to_h
  end
62

63
  # Retrieve the S3 resources uploaded to the S3 Bucket
  # @return [Array<S3File>]
  def client_s3_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix)
    if reload # force a reload
      clear_s3_responses(bucket_name:, prefix:)
      @client_s3_files = nil
    end
    @client_s3_files ||= get_s3_objects(bucket_name:, prefix:)
  end
72

73
  # Retrieve only the zero-byte S3 objects under the prefix.
  # @return [Array<S3File>]
  def client_s3_empty_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix)
    if reload # force a reload
      clear_s3_responses(bucket_name:, prefix:)
      @client_s3_empty_files = nil
    end
    @client_s3_empty_files ||= get_s3_objects(bucket_name:, prefix:).select(&:empty?)
  end
83

84
  ##
  # Query the S3 bucket for what we know about the doi
  # For docs see:
  # * https://docs.aws.amazon.com/sdk-for-ruby/v3/api/Aws/S3/Client.html#list_objects_v2-instance_method
  # * https://docs.aws.amazon.com/sdk-for-ruby/v3/api/Aws/S3/Client.html#get_object_attributes-instance_method
  # @return Hash with two properties {objects: [<S3File>], ok: Bool}
  #   objects is an Array of S3File objects
  #   ok is false if there is an error connecting to S3. Otherwise true.
  def data_profile
    { objects: client_s3_files, ok: true }
  rescue => error
    Rails.logger.error("Error querying S3. Bucket: #{bucket_name}. DOI: #{@doi}. Exception: #{error.message}")

    { objects: [], ok: false }
  end
99

100
  ##
  # Copies the existing files from the pre-curation bucket to the target bucket (postcuration or embargo).
  # Notice that the copy process happens at AWS (i.e. the files are not downloaded and re-uploaded).
  def publish_files(current_user)
    source_bucket = PULS3Client.pre_curation_config[:bucket]
    target_config = model.embargoed? ? PULS3Client.embargo_config : PULS3Client.post_curation_config
    target_bucket = target_config[:bucket]

    # Empty files are not moved; their presence is only noted in the provenance log.
    client_s3_empty_files(reload: true, bucket_name: source_bucket).each do |empty_file|
      WorkActivity.add_work_activity(model.id,
                                     "Warning: Attempted to publish empty S3 file #{empty_file.filename}.",
                                     current_user.id,
                                     activity_type: WorkActivity::SYSTEM)
    end

    # Snapshot the approved files, then enqueue one background move per file.
    files = client_s3_files(reload: true, bucket_name: source_bucket)
    snapshot = ApprovedUploadSnapshot.new(work: model)
    snapshot.store_files(files, current_user:)
    snapshot.save
    files.each do |file|
      ApprovedFileMoveJob.perform_later(work_id: model.id, source_bucket:, source_key: file.key,
                                        target_bucket:, target_key: file.key, size: file.size,
                                        snapshot_id: snapshot.id)
    end
    true
  end
131

132
  # Copy a single object within S3, switching to multipart copy when the object
  # exceeds the maximum single-part size.
  def copy_file(source_key:, target_bucket:, target_key:, size:)
    Rails.logger.info("Copying #{source_key} to #{target_bucket}/#{target_key}")
    return copy_multi_part(source_key:, target_bucket:, target_key:, size:) if size > part_size

    # "+" must be percent-encoded in the copy_source path.
    client.copy_object(copy_source: source_key.gsub("+", "%2B"), bucket: target_bucket, key: target_key, checksum_algorithm: "SHA256")
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to copy AWS S3 Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end
144

145
  # Perform an AWS-side multipart copy for objects larger than part_size.
  # Parts are copied in part_size slices; the collected etags/checksums are then
  # used to complete the multipart upload.
  def copy_multi_part(source_key:, target_bucket:, target_key:, size:)
    multi = client.create_multipart_upload(bucket: target_bucket, key: target_key, checksum_algorithm: "SHA256")
    # One iteration per part_size slice of the object; part numbers start at 1.
    parts = 0.step(size - 1, part_size).map.with_index(1) do |start_byte, part_num|
      end_byte = [start_byte + part_size, size].min - 1
      resp = client.upload_part_copy(bucket: target_bucket, copy_source: source_key, key: multi.key, part_number: part_num,
                                     upload_id: multi.upload_id, copy_source_range: "bytes=#{start_byte}-#{end_byte}")
      { etag: resp.copy_part_result.etag, part_number: part_num, checksum_sha256: resp.copy_part_result.checksum_sha256 }
    end
    client.complete_multipart_upload(bucket: target_bucket, key: target_key, upload_id: multi.upload_id, multipart_upload: { parts: })
  rescue Aws::Errors::ServiceError => aws_service_error
    message = "An error was encountered when requesting to multipart copy AWS S3 Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end
164

165
  # Delete a single object from S3.
  #
  # @param s3_file_key [String] the key of the object to delete
  # @param bucket [String] the bucket to delete from (defaults to this service's bucket)
  # @return [Hash] the delete_object response as a Hash
  # @raise [Aws::Errors::ServiceError] re-raised after logging
  def delete_s3_object(s3_file_key, bucket: bucket_name)
    resp = client.delete_object({ bucket:, key: s3_file_key })
    resp.to_h
  rescue Aws::Errors::ServiceError => aws_service_error
    # Fix: log the bucket actually targeted by the request. Previously this
    # interpolated bucket_name (the service default), which was misleading when
    # a caller passed an explicit bucket:.
    message = "An error was encountered when requesting to delete the AWS S3 Object #{s3_file_key} in the bucket #{bucket}: #{aws_service_error}"
    Rails.logger.error(message)
    raise aws_service_error
  end
173

174
  # Create a zero-byte placeholder object at this work's prefix, which S3
  # consoles render as a "directory".
  def create_directory
    client.put_object(bucket: bucket_name, key: prefix, content_length: 0)
  rescue Aws::Errors::ServiceError => aws_service_error
    Rails.logger.error("An error was encountered when requesting to create the AWS S3 directory Object in the bucket #{bucket_name} with the key #{prefix}: #{aws_service_error}")
    raise aws_service_error
  end
181

182
  # Upload an IO stream to S3 under this work's prefix.
  #
  # @param io [IO] the stream to upload
  # @param filename [String] the object's name within the prefix
  # @param size [Integer] the byte size of the upload
  # @param md5_digest [String, nil] optional precomputed MD5 digest
  # @return [String, nil] the S3 key on success, nil when the upload fails
  def upload_file(io:, filename:, size:, md5_digest: nil)
    # Fix: build the key from the prefix plus the filename argument. The prior
    # body contained the garbled literal "#(unknown)" and ignored filename,
    # sending every upload to the same bogus key.
    key = "#{prefix}#{filename}"
    key if s3client.upload_file(io:, target_key: key, size:, md5_digest:)
  end
188

189
  # HEAD an object to confirm it exists and retrieve its metadata.
  def check_file(bucket:, key:)
    client.head_object(bucket:, key:)
  rescue Aws::Errors::ServiceError => aws_service_error
    Rails.logger.error("An error was encountered when requesting to check the status of the AWS S3 Object in the bucket #{bucket} with the key #{key}: #{aws_service_error}")
    raise aws_service_error
  end
196

197
  private
2✔
198

199
    # Drop the cached list_objects_v2 pages for a bucket/prefix pair so the
    # next lookup refetches from AWS.
    def clear_s3_responses(bucket_name:, prefix:)
      @s3_responses["#{bucket_name} #{prefix}"] = nil
    end
203

204
    # Fetch (and memoize, per bucket/prefix) every list_objects_v2 page,
    # following continuation tokens until the listing is complete.
    def s3_responses(bucket_name:, prefix:)
      cache_key = "#{bucket_name} #{prefix}"
      @s3_responses[cache_key] ||= begin
        page = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix: })
        pages = [page]
        while page.is_truncated
          page = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix:, continuation_token: page.next_continuation_token })
          pages << page
        end
        pages
      end
    end
218

219
    # Load every object for a bucket/prefix and parse the responses into
    # S3File instances, logging how long the listing took.
    def get_s3_objects(bucket_name:, prefix:)
      start = Time.zone.now
      objects = s3_responses(bucket_name:, prefix:).flat_map { |resp| parse_objects(resp.to_h) }
      elapsed = Time.zone.now - start
      Rails.logger.info("Loading S3 objects. Bucket: #{bucket_name}. Prefix: #{prefix}. Elapsed: #{elapsed} seconds")
      objects
    end
231

232
    # Convert one list_objects_v2 response into an Array of S3File instances.
    # Returns [] when the response has no :contents.
    def parse_objects(resp)
      contents = resp.to_h[:contents]
      Array(contents).map do |object|
        S3File.new(work: model, filename: object[:key], last_modified: object[:last_modified], size: object[:size], checksum: object[:etag])
      end
    end
242
end
243
# rubocop:enable Metrics/ClassLength
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc