• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / bibdata / 0ab22b84-f1b6-4ce8-8341-38db8bfd0675

06 Jun 2024 05:03PM UTC coverage: 91.241% (-0.02%) from 91.26%
0ab22b84-f1b6-4ce8-8341-38db8bfd0675

Pull #2388

circleci

maxkadel
Start factoring out update full and incremental partner classes

Co-authored-by: Jane Sandberg <sandbergja@users.noreply.github.com>
Co-authored-by: Ryan Laddusaw <rladdusaw@users.noreply.github.com>
Co-authored-by: Winsice Ng <winsice-ng@users.noreply.github.com>
Pull Request #2388: Do not process full dumps with private records

54 of 59 new or added lines in 4 files covered. (91.53%)

9 existing lines in 1 file now uncovered.

3448 of 3779 relevant lines covered (91.24%)

348.3 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.84
/app/models/scsb/partner_updates.rb
1
require 'json'

module Scsb
  # Downloads partner (NYPL / CUL / HL) record dumps from the shared S3
  # bucket, normalizes the MARC records they contain, and attaches the
  # results to the given Dump as DumpFiles. Supports both full dumps and
  # incremental update/delete feeds.
  class PartnerUpdates
    # Full only
    # Entry point for processing a full partner dump.
    # @param dump [Dump] the dump record to attach processed files to
    def self.full(dump:)
      timestamp = DateTime.now.to_time
      dump_file_type = :recap_records_full
      new(dump:, timestamp:, dump_file_type:).process_full_files
    end

    # Incremental only
    # Entry point for processing incremental updates/deletes newer than +timestamp+.
    # @param dump [Dump]
    # @param timestamp [#to_time] S3 files older than this are skipped
    def self.incremental(dump:, timestamp:)
      dump_file_type = :recap_records
      new(dump:, timestamp: timestamp.to_time, dump_file_type:).process_incremental_files
    end

    # Both
    # @param dump [Dump] the dump to attach files and errors to
    # @param timestamp [Time] download filter (incremental) / run time (full)
    # @param s3_bucket [Scsb::S3Bucket] client for the partner transfer bucket
    # @param dump_file_type [Symbol] default type for attached dump files
    def initialize(dump:, timestamp:, s3_bucket: Scsb::S3Bucket.partner_transfer_client, dump_file_type:)
      @dump = dump
      @s3_bucket = s3_bucket
      @update_directory = ENV['SCSB_PARTNER_UPDATE_DIRECTORY'] || '/tmp/updates'
      @scsb_file_dir = ENV['SCSB_FILE_DIR']
      @last_dump = timestamp
      # Accumulators of record ids ('001' fields) that needed each kind of
      # cleanup; written out as a JSON log by #log_record_fixes.
      @inv_xml = []
      @tab_newline = []
      @leader = []
      @composed_chars = []
      @bad_utf8 = []
      @dump_file_type = dump_file_type
    end

    # Full only
    # Downloads and processes the full dump for each partner institution,
    # then records the generated date and the fix log on the dump.
    def process_full_files
      prepare_directory
      download_and_process_full(inst: "NYPL", prefix: 'scsbfull_nypl_')
      download_and_process_full(inst: "CUL", prefix: 'scsbfull_cul_')
      download_and_process_full(inst: "HL", prefix: 'scsbfull_hl_')
      set_generated_date
      log_record_fixes
    end

    # Both
    # Sets the dump's generated_date to the earliest date parsed from the
    # attached dump files' names.
    def set_generated_date
      @dump.generated_date = date_strings.map { |str| DateTime.parse(str) }.sort.first
    end

    # Both
    # Extracts the date segment from each attached dump file's basename.
    # Metadata files carry the date in a different underscore-delimited
    # position than record files.
    def date_strings
      @dump.dump_files.map do |df|
        if df.dump_file_type == "recap_records_full_metadata"
          File.basename(df.path).split("_")[3]
        else
          File.basename(df.path).split("_")[2]
        end
      end
    end

    # Full only
    # Validates the institution's metadata CSV, then downloads and processes
    # its full dump zip. Returns false without processing when the CSV check
    # fails (e.g. the dump includes private records); records an error on the
    # dump event when no zip matches.
    # @param inst [String] institution code used to match S3 file names
    # @param prefix [String] file prefix for the processed output files
    def download_and_process_full(inst:, prefix:)
      full_download = Scsb::PartnerUpdates::Full.new(s3_bucket: @s3_bucket, dump: @dump)
      return false unless full_download.validate_csv(inst:)

      matcher = /#{inst}.*\.zip/
      file = full_download.download_full_file(matcher)
      if file
        process_partner_updates(files: [file], file_prefix: prefix)
      else
        add_error(message: "No full dump files found matching #{inst}")
      end
    end

    # Full only
    # def validate_csv(inst:)
    #   matcher = /#{inst}.*\.csv/
    #   file = download_full_file(matcher)
    #   includes_private = ''
    #   if file
    #     csv = CSV.read(file, headers: true)
    #     includes_private = csv["Collection Group Id(s)"].first.include?('3')
    #     add_error(message: "Metadata file indicates that dump for #{inst} includes private records, not processing.") if includes_private
    #     filename = File.basename(file)
    #     destination_filepath = "#{@scsb_file_dir}/#{filename}"
    #     FileUtils.move(file, destination_filepath)
    #     attach_dump_file(destination_filepath, dump_file_type: :recap_records_full_metadata)
    #     File.unlink(destination_filepath) if File.exist?(destination_filepath)
    #   else
    #     add_error(message: "No metadata files found matching #{inst}")
    #   end
    #   !includes_private
    # end

    # Incremental only
    # Downloads and processes update files, then delete files, attaching the
    # results and the fix log to the dump.
    def process_incremental_files
      prepare_directory
      update_files = download_partner_updates
      process_partner_updates(files: update_files)
      set_generated_date
      log_record_fixes
      delete_files = download_partner_deletes
      process_partner_deletes(files: delete_files)
    end

    private

      # Both
      # Appends +message+ to the dump event's error field, preserving any
      # existing errors as a semicolon-joined string, and saves the event.
      def add_error(message:)
        error = Array.wrap(@dump.event.error)
        error << message
        @dump.event.error = error.join("; ")
        @dump.event.save
      end

      # Incremental only
      # Downloads incremental MARCXML update files newer than @last_dump.
      def download_partner_updates
        file_list = @s3_bucket.list_files(prefix: ENV['SCSB_S3_PARTNER_UPDATES'] || 'data-exports/PUL/MARCXml/Incremental')
        @s3_bucket.download_files(files: file_list, timestamp_filter: @last_dump, output_directory: @update_directory)
      end

      # Incremental only
      # Downloads JSON delete files newer than @last_dump.
      def download_partner_deletes
        file_list = @s3_bucket.list_files(prefix: ENV['SCSB_S3_PARTNER_DELETES'] || 'data-exports/PUL/Json')
        @s3_bucket.download_files(files: file_list, timestamp_filter: @last_dump, output_directory: @update_directory)
      end

      # Full only
      # def download_full_file(file_filter)
      #   prefix = ENV['SCSB_S3_PARTNER_FULLS'] || 'data-exports/PUL/MARCXml/Full'
      #   @s3_bucket.download_recent(prefix:, output_directory: @update_directory, file_filter:)
      # end

      # Incremental only
      # Unzips each downloaded archive, normalizes every MARC record inside
      # (via #process_record), writes the cleaned records to @scsb_file_dir,
      # and attaches each output file to the dump. Source zips and
      # intermediate XML files are deleted as they are consumed.
      # @param files [Array<String>] paths to downloaded .zip files
      # @param file_prefix [String] prefix for the processed output file names
      def process_partner_updates(files:, file_prefix: 'scsb_update_')
        xml_files = []
        files.each do |file|
          filename = File.basename(file, '.zip')
          # Reduce e.g. "CUL_20240101_123456_new" to "20240101_123456".
          filename.gsub!(/^[^_]+_([0-9]+)_([0-9]+).*$/, '\1_\2')
          file_increment = 1
          Zip::File.open(file) do |zip_file|
            zip_file.each do |entry|
              target = "#{@update_directory}/#{filename}_#{file_increment}.xml"
              xml_files << target
              entry.extract(target)
              file_increment += 1
            end
          end
          File.unlink(file)
        end
        xml_files.each do |file|
          filename = File.basename(file)
          reader = MARC::XMLReader.new(file.to_s, external_encoding: 'UTF-8')
          filepath = "#{@scsb_file_dir}/#{file_prefix}#{filename}"
          writer = MARC::XMLWriter.new(filepath)
          reader.each { |record| writer.write(process_record(record)) }
          writer.close
          File.unlink(file)
          attach_dump_file(filepath)
        end
      end

      # Incremental only
      # Unzips each delete archive, collects the SCSB bib ids from the JSON
      # inside, and stores them on the dump as delete_ids.
      # @param files [Array<String>] paths to downloaded .zip files
      def process_partner_deletes(files:)
        json_files = []
        files.each do |file|
          filename = File.basename(file, '.zip')
          file_increment = 1
          Zip::File.open(file) do |zip_file|
            zip_file.each do |entry|
              target = "#{@update_directory}/scsbdelete#{filename}_#{file_increment}.json"
              json_files << target
              entry.extract(target)
              file_increment += 1
            end
          end
          File.unlink(file)
        end
        ids = []
        json_files.each do |file|
          scsb_ids(file, ids)
          File.unlink(file)
        end
        @dump.delete_ids = ids
        @dump.save
      end

      # Incremental only (deletes)
      # Reads a delete-feed JSON file and appends "SCSB-<bibId>" for each
      # record to +ids+. Returns the (mutated) +ids+ array.
      def scsb_ids(filename, ids)
        file = File.read(filename)
        data = JSON.parse(file)
        data.each do |record|
          ids << "SCSB-#{record['bib']['bibId']}"
        end
        ids
      end

      # Incremental only
      # Applies the standard cleanup pipeline to a single MARC record:
      # drops 856/959 fields, maps a 'd' (deleted) leader status to 'c',
      # then runs each detect-and-fix pass, recording the record's '001'
      # id in the matching accumulator whenever a fix was needed.
      def process_record(record)
        record = field_delete(['856', '959'], record)
        record.leader[5] = 'c' if record.leader[5].eql?('d')
        if bad_utf8?(record)
          @bad_utf8 << record['001']
          record = bad_utf8_fix(record)
        end
        if invalid_xml_chars?(record)
          @inv_xml << record['001']
          record = invalid_xml_fix(record)
        end
        if tab_newline_char?(record)
          @tab_newline << record['001']
          record = tab_newline_fix(record)
        end
        if leader_errors?(record)
          @leader << record['001']
          record = leaderfix(record)
        end
        if composed_chars_errors?(record)
          @composed_chars << record['001']
          record = composed_chars_normalize(record)
        end
        record = extra_space_fix(record)
        empty_subfield_fix(record)
      end

      # Both
      # Creates a DumpFile for +filepath+ (zipping it), and associates it
      # with the dump. Falls back to @dump_file_type when no explicit type
      # is given.
      def attach_dump_file(filepath, dump_file_type: nil)
        dump_file_type ||= @dump_file_type
        df = DumpFile.create(dump_file_type:, path: filepath)
        df.zip
        df.save
        @dump.dump_files << df
        @dump.save
      end

      # Both
      # Writes the accumulated record-fix ids to a JSON log file and
      # attaches it to the dump as a :log_file.
      def log_record_fixes
        log_file = {
          inv_xml: @inv_xml,
          tab_newline: @tab_newline,
          leader: @leader,
          composed_chars: @composed_chars,
          bad_utf8: @bad_utf8
        }
        filepath = log_file_name
        File.write(filepath, log_file.to_json.to_s)
        attach_dump_file(filepath, dump_file_type: :log_file)
      end

      # Both
      # Path of the fix log, keyed by the run's timestamp date.
      def log_file_name
        "#{@scsb_file_dir}/fixes_#{@last_dump.strftime('%Y_%m_%d')}.json"
      end

      # Both
      # Ensures the working download/extract directory exists.
      def prepare_directory
        FileUtils.mkdir_p(@update_directory)
      end
  end
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc