• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / bibdata / 0ab22b84-f1b6-4ce8-8341-38db8bfd0675

06 Jun 2024 05:03PM UTC coverage: 91.241% (-0.02%) from 91.26%
0ab22b84-f1b6-4ce8-8341-38db8bfd0675

Pull #2388

circleci

maxkadel
Start factoring out update full and incremental partner classes

Co-authored-by: Jane Sandberg <sandbergja@users.noreply.github.com>
Co-authored-by: Ryan Laddusaw <rladdusaw@users.noreply.github.com>
Co-authored-by: Winsice Ng <winsice-ng@users.noreply.github.com>
Pull Request #2388: Do not process full dumps with private records

54 of 59 new or added lines in 4 files covered. (91.53%)

9 existing lines in 1 file now uncovered.

3448 of 3779 relevant lines covered (91.24%)

348.3 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.84
/app/models/scsb/partner_updates.rb
1
require 'json'

module Scsb
  # Downloads partner (NYPL / CUL / HL) record dumps from the shared S3
  # bucket, normalizes the MARC records they contain, and attaches the
  # results to the given Dump as DumpFiles. Supports both full dumps and
  # incremental update/delete feeds.
  class PartnerUpdates
    # Full only
    # Entry point for processing a full partner dump.
    # @param dump [Dump] the dump record to attach processed files to
    def self.full(dump:)
      timestamp = DateTime.now.to_time
      dump_file_type = :recap_records_full
      new(dump:, timestamp:, dump_file_type:).process_full_files
    end

    # Incremental only
    # Entry point for processing incremental updates/deletes newer than +timestamp+.
    # @param dump [Dump]
    # @param timestamp [#to_time] S3 files older than this are skipped
    def self.incremental(dump:, timestamp:)
      dump_file_type = :recap_records
      new(dump:, timestamp: timestamp.to_time, dump_file_type:).process_incremental_files
    end

    # Both
    # @param dump [Dump] the dump to attach files and errors to
    # @param timestamp [Time] download filter (incremental) / run time (full)
    # @param s3_bucket [Scsb::S3Bucket] client for the partner transfer bucket
    # @param dump_file_type [Symbol] default type for attached dump files
    def initialize(dump:, timestamp:, s3_bucket: Scsb::S3Bucket.partner_transfer_client, dump_file_type:)
      @dump = dump
      @s3_bucket = s3_bucket
      @update_directory = ENV['SCSB_PARTNER_UPDATE_DIRECTORY'] || '/tmp/updates'
      @scsb_file_dir = ENV['SCSB_FILE_DIR']
      @last_dump = timestamp
      # Accumulators of record ids ('001' fields) that needed each kind of
      # cleanup; written out as a JSON log by #log_record_fixes.
      @inv_xml = []
      @tab_newline = []
      @leader = []
      @composed_chars = []
      @bad_utf8 = []
      @dump_file_type = dump_file_type
    end

    # Full only
    # Downloads and processes the full dump for each partner institution,
    # then records the generated date and the fix log on the dump.
    def process_full_files
      prepare_directory
      download_and_process_full(inst: "NYPL", prefix: 'scsbfull_nypl_')
      download_and_process_full(inst: "CUL", prefix: 'scsbfull_cul_')
      download_and_process_full(inst: "HL", prefix: 'scsbfull_hl_')
      set_generated_date
      log_record_fixes
    end

    # Both
    # Sets the dump's generated_date to the earliest date parsed from the
    # attached dump files' names.
    def set_generated_date
      @dump.generated_date = date_strings.map { |str| DateTime.parse(str) }.sort.first
    end

    # Both
    # Extracts the date segment from each attached dump file's basename.
    # Metadata files carry the date in a different underscore-delimited
    # position than record files.
    def date_strings
      @dump.dump_files.map do |df|
        if df.dump_file_type == "recap_records_full_metadata"
          File.basename(df.path).split("_")[3]
        else
          File.basename(df.path).split("_")[2]
        end
      end
    end

    # Full only
    # Validates the institution's metadata CSV, then downloads and processes
    # its full dump zip. Returns false without processing when the CSV check
    # fails (e.g. the dump includes private records); records an error on the
    # dump event when no zip matches.
    # @param inst [String] institution code used to match S3 file names
    # @param prefix [String] file prefix for the processed output files
    def download_and_process_full(inst:, prefix:)
      full_download = Scsb::PartnerUpdates::Full.new(s3_bucket: @s3_bucket, dump: @dump)
      return false unless full_download.validate_csv(inst:)

      matcher = /#{inst}.*\.zip/
      file = full_download.download_full_file(matcher)
      if file
        process_partner_updates(files: [file], file_prefix: prefix)
      else
        add_error(message: "No full dump files found matching #{inst}")
      end
    end

    # Full only
    # def validate_csv(inst:)
    #   matcher = /#{inst}.*\.csv/
    #   file = download_full_file(matcher)
    #   includes_private = ''
    #   if file
    #     csv = CSV.read(file, headers: true)
    #     includes_private = csv["Collection Group Id(s)"].first.include?('3')
    #     add_error(message: "Metadata file indicates that dump for #{inst} includes private records, not processing.") if includes_private
    #     filename = File.basename(file)
    #     destination_filepath = "#{@scsb_file_dir}/#{filename}"
    #     FileUtils.move(file, destination_filepath)
    #     attach_dump_file(destination_filepath, dump_file_type: :recap_records_full_metadata)
    #     File.unlink(destination_filepath) if File.exist?(destination_filepath)
    #   else
    #     add_error(message: "No metadata files found matching #{inst}")
    #   end
    #   !includes_private
    # end

    # Incremental only
    # Downloads and processes update files, then delete files, attaching the
    # results and the fix log to the dump.
    def process_incremental_files
      prepare_directory
      update_files = download_partner_updates
      process_partner_updates(files: update_files)
      set_generated_date
      log_record_fixes
      delete_files = download_partner_deletes
      process_partner_deletes(files: delete_files)
    end

    private

      # Both
      # Appends +message+ to the dump event's error field, preserving any
      # existing errors as a semicolon-joined string, and saves the event.
      def add_error(message:)
        error = Array.wrap(@dump.event.error)
        error << message
        @dump.event.error = error.join("; ")
        @dump.event.save
      end

      # Incremental only
      # Downloads incremental MARCXML update files newer than @last_dump.
      def download_partner_updates
        file_list = @s3_bucket.list_files(prefix: ENV['SCSB_S3_PARTNER_UPDATES'] || 'data-exports/PUL/MARCXml/Incremental')
        @s3_bucket.download_files(files: file_list, timestamp_filter: @last_dump, output_directory: @update_directory)
      end

      # Incremental only
      # Downloads JSON delete files newer than @last_dump.
      def download_partner_deletes
        file_list = @s3_bucket.list_files(prefix: ENV['SCSB_S3_PARTNER_DELETES'] || 'data-exports/PUL/Json')
        @s3_bucket.download_files(files: file_list, timestamp_filter: @last_dump, output_directory: @update_directory)
      end

      # Full only
      # def download_full_file(file_filter)
      #   prefix = ENV['SCSB_S3_PARTNER_FULLS'] || 'data-exports/PUL/MARCXml/Full'
      #   @s3_bucket.download_recent(prefix:, output_directory: @update_directory, file_filter:)
      # end

      # Incremental only
      # Unzips each downloaded archive, normalizes every MARC record inside
      # (via #process_record), writes the cleaned records to @scsb_file_dir,
      # and attaches each output file to the dump. Source zips and
      # intermediate XML files are deleted as they are consumed.
      # @param files [Array<String>] paths to downloaded .zip files
      # @param file_prefix [String] prefix for the processed output file names
      def process_partner_updates(files:, file_prefix: 'scsb_update_')
        xml_files = []
        files.each do |file|
          filename = File.basename(file, '.zip')
          # Reduce e.g. "CUL_20240101_123456_new" to "20240101_123456".
          filename.gsub!(/^[^_]+_([0-9]+)_([0-9]+).*$/, '\1_\2')
          file_increment = 1
          Zip::File.open(file) do |zip_file|
            zip_file.each do |entry|
              target = "#{@update_directory}/#{filename}_#{file_increment}.xml"
              xml_files << target
              entry.extract(target)
              file_increment += 1
            end
          end
          File.unlink(file)
        end
        xml_files.each do |file|
          filename = File.basename(file)
          reader = MARC::XMLReader.new(file.to_s, external_encoding: 'UTF-8')
          filepath = "#{@scsb_file_dir}/#{file_prefix}#{filename}"
          writer = MARC::XMLWriter.new(filepath)
          reader.each { |record| writer.write(process_record(record)) }
          writer.close
          File.unlink(file)
          attach_dump_file(filepath)
        end
      end

      # Incremental only
      # Unzips each delete archive, collects the SCSB bib ids from the JSON
      # inside, and stores them on the dump as delete_ids.
      # @param files [Array<String>] paths to downloaded .zip files
      def process_partner_deletes(files:)
        json_files = []
        files.each do |file|
          filename = File.basename(file, '.zip')
          file_increment = 1
          Zip::File.open(file) do |zip_file|
            zip_file.each do |entry|
              target = "#{@update_directory}/scsbdelete#{filename}_#{file_increment}.json"
              json_files << target
              entry.extract(target)
              file_increment += 1
            end
          end
          File.unlink(file)
        end
        ids = []
        json_files.each do |file|
          scsb_ids(file, ids)
          File.unlink(file)
        end
        @dump.delete_ids = ids
        @dump.save
      end

      # Incremental only (deletes)
      # Reads a delete-feed JSON file and appends "SCSB-<bibId>" for each
      # record to +ids+. Returns the (mutated) +ids+ array.
      def scsb_ids(filename, ids)
        file = File.read(filename)
        data = JSON.parse(file)
        data.each do |record|
          ids << "SCSB-#{record['bib']['bibId']}"
        end
        ids
      end

      # Incremental only
      # Applies the standard cleanup pipeline to a single MARC record:
      # drops 856/959 fields, maps a 'd' (deleted) leader status to 'c',
      # then runs each detect-and-fix pass, recording the record's '001'
      # id in the matching accumulator whenever a fix was needed.
      def process_record(record)
        record = field_delete(['856', '959'], record)
        record.leader[5] = 'c' if record.leader[5].eql?('d')
        if bad_utf8?(record)
          @bad_utf8 << record['001']
          record = bad_utf8_fix(record)
        end
        if invalid_xml_chars?(record)
          @inv_xml << record['001']
          record = invalid_xml_fix(record)
        end
        if tab_newline_char?(record)
          @tab_newline << record['001']
          record = tab_newline_fix(record)
        end
        if leader_errors?(record)
          @leader << record['001']
          record = leaderfix(record)
        end
        if composed_chars_errors?(record)
          @composed_chars << record['001']
          record = composed_chars_normalize(record)
        end
        record = extra_space_fix(record)
        empty_subfield_fix(record)
      end

      # Both
      # Creates a DumpFile for +filepath+ (zipping it), and associates it
      # with the dump. Falls back to @dump_file_type when no explicit type
      # is given.
      def attach_dump_file(filepath, dump_file_type: nil)
        dump_file_type ||= @dump_file_type
        df = DumpFile.create(dump_file_type:, path: filepath)
        df.zip
        df.save
        @dump.dump_files << df
        @dump.save
      end

      # Both
      # Writes the accumulated record-fix ids to a JSON log file and
      # attaches it to the dump as a :log_file.
      def log_record_fixes
        log_file = {
          inv_xml: @inv_xml,
          tab_newline: @tab_newline,
          leader: @leader,
          composed_chars: @composed_chars,
          bad_utf8: @bad_utf8
        }
        filepath = log_file_name
        File.write(filepath, log_file.to_json.to_s)
        attach_dump_file(filepath, dump_file_type: :log_file)
      end

      # Both
      # Path of the fix log, keyed by the run's timestamp date.
      def log_file_name
        "#{@scsb_file_dir}/fixes_#{@last_dump.strftime('%Y_%m_%d')}.json"
      end

      # Both
      # Ensures the working download/extract directory exists.
      def prepare_directory
        FileUtils.mkdir_p(@update_directory)
      end
  end
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc