2ee2c4fc-5ef0-4806-b86e-01bf70aa67a0

Committed 24 Dec 2024 04:55PM UTC coverage: 91.859% (-0.04%) from 91.902%

Build # 2ee2c4fc-5ef0-4806-b86e-01bf70aa67a0

Build Type

Pull #2569

circleci

Committed by

christinach

Commit Message

Generate new .rubocop_todo.yml
rubocop fix

Pull Request #2569: Rubocop gems

Run Details

335 of 378 new or added lines in 57 files covered. (88.62%)

2 existing lines in 2 files now uncovered.

3385 of 3685 relevant lines covered (91.86%)

377.92 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.34

/marc_to_solr/lib/augment_the_subject.rb

# frozen_string_literal: true

require 'set'

##
# The creation and management of metadata are not neutral activities.
#
# Decides whether a list of subject headings should additionally carry the
# "Indigenous Studies" heading. A heading qualifies when any of these hold:
# 1. its first subfield is one of the standalone subfield a terms,
# 2. any one of its subfields is one of the standalone subfield x terms,
# 3. its first subfield plus a required combination of further subfields
#    appears in the required-subfields fixture.
#
# This class relies on a SEPARATOR constant (the string that joins the
# subfields of a heading) and on `blank?` being available on each term;
# neither is defined in this file.
class AugmentTheSubject
  FIXTURE_DIR = File.join(File.dirname(__FILE__), 'augment_the_subject')

  LCSH_TERMS_CSV_FILE = File.join(FIXTURE_DIR, 'indigenous_studies.csv')
  # Can be re-created using `bundle exec rake augment:recreate_fixtures`
  LCSH_STANDALONE_A_FILE = File.join(FIXTURE_DIR, 'standalone_subfield_a.json')
  # Must be created by hand from file provided by metadata librarians
  LCSH_STANDALONE_X_FILE = File.join(FIXTURE_DIR, 'standalone_subfield_x.json')
  # Can be re-created using `bundle exec rake augment:recreate_fixtures`
  LCSH_REQUIRED_SUBFIELDS = File.join(FIXTURE_DIR, 'indigenous_studies_required.json')

  # Label used in the error message => path that must exist. Checked in this order.
  REQUIRED_FILES = {
    'lcsh csv' => LCSH_TERMS_CSV_FILE,
    'lcsh standalone subfield a' => LCSH_STANDALONE_A_FILE,
    'lcsh standalone subfield x' => LCSH_STANDALONE_X_FILE,
    'lcsh required subfields' => LCSH_REQUIRED_SUBFIELDS
  }.freeze

  private_constant :FIXTURE_DIR, :REQUIRED_FILES

  ##
  # Ensure the needed config files exist
  # @raise [RuntimeError] naming the first file that is missing
  def initialize
    REQUIRED_FILES.each do |label, path|
      raise "Cannot find #{label} file at #{path}" unless File.exist?(path)
    end
  end

  ##
  # Normalized terms read from the `standalone_subfield_a` key of
  # LCSH_STANDALONE_A_FILE. Loaded on first use, then memoized.
  # @return [<String>]
  def standalone_subfield_a_terms
    @standalone_subfield_a_terms ||= load_standalone_terms(LCSH_STANDALONE_A_FILE, :standalone_subfield_a)
  end

  ##
  # Normalized terms read from the `standalone_subfield_x` key of
  # LCSH_STANDALONE_X_FILE. Loaded on first use, then memoized.
  # @return [<String>]
  def standalone_subfield_x_terms
    @standalone_subfield_x_terms ||= load_standalone_terms(LCSH_STANDALONE_X_FILE, :standalone_subfield_x)
  end

  ##
  # Required subfield combinations read from LCSH_REQUIRED_SUBFIELDS, memoized.
  # Keys are normalized first subfields, symbolized for fast and consistent
  # retrieval; each value is a list of Sets, where one Set is one combination
  # of normalized subfields that must all be present.
  # @return [Hash{Symbol => <Set<String>>}]
  def indigenous_studies_required
    @indigenous_studies_required ||=
      JSON.parse(File.read(LCSH_REQUIRED_SUBFIELDS)).each_with_object({}) do |(subfield_a, combinations), lookup|
        lookup[normalize(subfield_a).to_sym] = combinations.map do |combination|
          combination.map { |required_term| normalize(required_term) }.to_set
        end
      end
  end

  ##
  # Normalize lcsh terms so they can match at index time.
  # 1. downcase
  # 2. replace ǂ terms with SEPARATOR
  # @param [String] lcsh_term
  # @return [String]
  def normalize(lcsh_term)
    lcsh_term.chomp.downcase.gsub(/ ǂ. /, SEPARATOR)
  end

  ##
  # Given an array of terms, add "Indigenous Studies" if any of the terms match.
  # The array that was passed in is appended to in place and then returned.
  # @param [<String>] terms
  # @return [<String>]
  def add_indigenous_studies(terms)
    terms << 'Indigenous Studies' if indigenous_studies?(terms)
    terms
  end

  ##
  # Given an array of terms, check whether this set of terms should have an
  # additional subject heading of "Indigenous Studies" added.
  # Blank terms are skipped; the first matching term ends the search.
  # @param [<String>] terms
  # @return [Boolean]
  def indigenous_studies?(terms)
    terms.any? do |term|
      next false if term.blank?

      subfield_a_match?(term) ||
        subfield_x_match?(term) ||
        subfield_a_with_required_subfields_match?(term)
    end
  end

  ##
  # For some subject terms, only the first part needs to match.
  # E.g., "Quinnipiac Indians-History", "Quinnipiac Indians-Culture" should both
  # be assigned an Indigenous Studies term even though that entire term doesn't
  # appear in our terms list. One trailing period on the first part is ignored.
  # @param [String] term
  # @return [Boolean]
  def subfield_a_match?(term)
    first_subfield = term.split(SEPARATOR).first
    # A term made up only of separators splits to an empty list.
    return false if first_subfield.nil?

    standalone_subfield_a_lookup.include?(normalize(first_subfield).delete_suffix('.'))
  end

  ##
  # For some subfield terms, only a single subfield needs to match.
  # E.g., any subject term that includes "Indian authors" should be assigned Indigenous Studies
  # @param [String] term
  # @return [Boolean]
  def subfield_x_match?(term)
    normalized_subfields(term).any? { |subfield| standalone_subfield_x_lookup.include?(subfield) }
  end

  ##
  # Some subject terms require a combination of terms in order to be assigned Indigenous Studies.
  # For example, "Alaska-Antiquities" should be a match, but "Alaska" by itself should not,
  # nor should "Antiquities" by itself.
  # @param [String] term
  # @return [Boolean]
  def subfield_a_with_required_subfields_match?(term)
    subfield_a, *subdivisions = normalized_subfields(term)
    # A term made up only of separators has no first subfield to look up.
    return false if subfield_a.nil?

    required_combinations = indigenous_studies_required[subfield_a.to_sym]
    return false unless required_combinations

    # Build the comparison set once instead of once per combination.
    present = subdivisions.to_set
    required_combinations.any? { |combination| combination.subset?(present) }
  end

  # In order to re-write the fixture file based on a new CSV, run the rake task
  # `bundle exec rake augment:recreate_fixtures`
  #
  # Collects the first subfield of every CSV row that is not flagged 'y' in the
  # subdivisions column, de-duplicated and sorted.
  # @return [Hash{Symbol => <String>}] with the single key :standalone_subfield_a
  def self.parse_standalone_a
    subfield_a_terms = Set.new
    CSV.foreach(LCSH_TERMS_CSV_FILE, headers: true) do |row|
      next if row['With subdivisions ǂx etc.'] == 'y'

      subfield_a_terms << row['Term in MARC'].chomp.split('ǂ').first.strip
    end
    { standalone_subfield_a: subfield_a_terms.sort }
  end

  # In order to re-write the fixture file based on a new CSV, run the rake task
  # `bundle exec rake augment:recreate_fixtures`
  #
  # Groups the CSV rows flagged 'y' in the subdivisions column by their first
  # subfield; each row contributes one list of its remaining subfields.
  # Unlike parse_standalone_a, this returns an already-serialized JSON string.
  # @return [String] JSON object mapping first subfield => list of subfield lists
  def self.parse_required_subfields
    required = {}
    CSV.foreach(LCSH_TERMS_CSV_FILE, headers: true) do |row|
      next unless row['With subdivisions ǂx etc.'] == 'y'

      subfield_a, *subdivisions = row['Term in MARC'].chomp.split(/ ǂ. /)
      (required[subfield_a] ||= []) << subdivisions
    end
    required.to_json
  end

  private

  # Reads the JSON file at +path+ and returns the normalized terms listed under +key+.
  def load_standalone_terms(path, key)
    JSON.parse(File.read(path), symbolize_names: true).fetch(key).map { |term| normalize(term) }
  end

  # Splits a heading on SEPARATOR and normalizes each resulting subfield.
  def normalized_subfields(term)
    term.split(SEPARATOR).map { |subfield| normalize(subfield) }
  end

  # Set view of the standalone subfield a terms, so membership checks avoid a linear scan.
  def standalone_subfield_a_lookup
    @standalone_subfield_a_lookup ||= standalone_subfield_a_terms.to_set
  end

  # Set view of the standalone subfield x terms, so membership checks avoid a linear scan.
  def standalone_subfield_x_lookup
    @standalone_subfield_x_lookup ||= standalone_subfield_x_terms.to_set
  end
end

1	# frozen_string_literal: true
2
3	require 'set'	1✔
4
5	##
6	# The creation and management of metadata are not neutral activities.
7	class AugmentTheSubject	1✔
8	LCSH_TERMS_CSV_FILE = File.join(File.dirname(__FILE__), 'augment_the_subject', 'indigenous_studies.csv')	1✔
9	# Can be re-created using `bundle exec rake augment:recreate_fixtures`
10	LCSH_STANDALONE_A_FILE = File.join(File.dirname(__FILE__), 'augment_the_subject', 'standalone_subfield_a.json')	1✔
11	# Must be created by hand from file provided by metadata librarians
12	LCSH_STANDALONE_X_FILE = File.join(File.dirname(__FILE__), 'augment_the_subject', 'standalone_subfield_x.json')	1✔
13	# Can be re-created using `bundle exec rake augment:recreate_fixtures`
14	LCSH_REQUIRED_SUBFIELDS = File.join(File.dirname(__FILE__), 'augment_the_subject', 'indigenous_studies_required.json')	1✔
15
16	##
17	# Ensure the needed config files exist
18	def initialize	1✔
19	raise "Cannot find lcsh csv file at #{LCSH_TERMS_CSV_FILE}" unless File.exist?(LCSH_TERMS_CSV_FILE)	34✔
20	unless File.exist?(LCSH_STANDALONE_A_FILE)	34✔
NEW 21	raise "Cannot find lcsh standalone subfield a file at #{LCSH_STANDALONE_A_FILE}"	×
22	end
23	unless File.exist?(LCSH_STANDALONE_X_FILE)	34✔
NEW 24	raise "Cannot find lcsh standalone subfield x file at #{LCSH_STANDALONE_X_FILE}"	×
25	end
26	unless File.exist?(LCSH_REQUIRED_SUBFIELDS)	34✔
NEW 27	raise "Cannot find lcsh required subfields file at #{LCSH_REQUIRED_SUBFIELDS}"	×
28	end
29	end
30
31	def standalone_subfield_a_terms	1✔
32	@standalone_subfield_a_terms \|\|= begin	1,308✔
33	parsed_json = JSON.parse(File.read(LCSH_STANDALONE_A_FILE), { symbolize_names: true })	26✔
34	parsed_json[:standalone_subfield_a].map do \|term\|	26✔
35	normalize(term)	145,574✔
36	end
37	end
38	end
39
40	def standalone_subfield_x_terms	1✔
41	@standalone_subfield_x_terms \|\|= begin	1,283✔
42	parsed_json = JSON.parse(File.read(LCSH_STANDALONE_X_FILE), { symbolize_names: true })	16✔
43	parsed_json[:standalone_subfield_x].map do \|term\|	16✔
44	normalize(term)	416✔
45	end
46	end
47	end
48
49	def indigenous_studies_required	1✔
50	@indigenous_studies_required \|\|= begin	1,279✔
51	parsed_json = JSON.parse(File.read(LCSH_REQUIRED_SUBFIELDS), { symbolize_names: false })	13✔
52	# Turns all the sub-arrays into sets for set comparison later
53	parsed_json.transform_values! do \|value\|	13✔
54	value.map do \|val\|	1,378✔
55	val.map { \|term\| normalize(term) }.to_set	5,148✔
56	end
57	end
58	# Normalizes and symbolizes key for fast and consistent retrieval
59	parsed_json.transform_keys! do \|key\|	13✔
60	normalize(key).to_sym	1,378✔
61	end
62	end
63	end
64
65	##
66	# Normalize lcsh terms so they can match at index time.
67	# 1. downcase
68	# 2. replace ǂ terms with SEPARATOR
69	def normalize(lcsh_term)	1✔
70	lcsh_term.chomp.downcase.gsub(/ ǂ. /, SEPARATOR)	156,247✔
71	end
72
73	##
74	# Given an array of terms, add "Indigenous Studies" if any of the terms match
75	# @param [<String>] terms
76	# @return [<String>]
77	def add_indigenous_studies(terms)	1✔
78	terms << 'Indigenous Studies' if indigenous_studies?(terms)	657✔
79	terms	657✔
80	end
81
82	##
83	# Given an array of terms, check whether this set of terms should have an
84	# additional subject heading of "Indigenous Studies" added
85	# @param [<String>] terms
86	# @return [Boolean]
87	def indigenous_studies?(terms)	1✔
88	terms.each do \|term\|	677✔
89	next if term.blank?	1,304✔
90
91	return true if subfield_a_match?(term)	1,303✔
92	return true if subfield_x_match?(term)	1,279✔
93	return true if subfield_a_with_required_subfields_match?(term)	1,276✔
94	end
95	false	645✔
96	end
97
98	##
99	# For some subject terms, only the first part needs to match.
100	# E.g., "Quinnipiac Indians-History", "Quinnipiac Indians-Culture" should both
101	# be assigned an Indigenous Studies term even though that entire term doesn't
102	# appear in our terms list.
103	def subfield_a_match?(term)	1✔
104	subfield_a = normalize(term.split(SEPARATOR).first).gsub(/\.$/, '')	1,306✔
105	standalone_subfield_a_terms.include?(subfield_a)	1,306✔
106	end
107
108	##
109	# For some subfield terms, only a single subfield needs to match.
110	# E.g., any subject term that includes "Indian authors" should be assigned Indigenous Studies
111	def subfield_x_match?(term)	1✔
112	subfields = term.split(SEPARATOR)	1,281✔
113	subfields = subfields.map { \|subfield\| normalize(subfield) }	3,533✔
114	!(standalone_subfield_x_terms & subfields).empty?	1,281✔
115	end
116
117	##
118	# Some subject terms require a combination of terms in order to be assigned Indigenous Studies.
119	# For example, "Alaska-Antiquities" should be a match, but "Alaska" by itself should not,
120	# nor should "Antiquities" by itself.
121	def subfield_a_with_required_subfields_match?(term)	1✔
122	subfields = term.split(SEPARATOR)	1,277✔
123	subfields = subfields.map { \|subfield\| normalize(subfield) }	3,516✔
124	subfield_a = subfields.shift.to_sym	1,277✔
125
126	required_subfields = indigenous_studies_required[subfield_a]	1,277✔
127	return false unless required_subfields	1,277✔
128
129	required_subfields.map do \|req_terms\|	47✔
130	return true if req_terms.subset?(subfields.to_set)	184✔
131	end
132	false	42✔
133	end
134
135	# In order to re-write the fixture file based on a new CSV, run the rake task
136	# `bundle exec rake augment:recreate_fixtures`
137	def self.parse_standalone_a	1✔
138	subfield_a_aggregator = Set.new	1✔
139	CSV.foreach(LCSH_TERMS_CSV_FILE, headers: true) do \|row\|	1✔
140	requires_subfield = row['With subdivisions ǂx etc.'] == 'y'	5,758✔
141	unless requires_subfield	5,758✔
142	lcsh_term = row['Term in MARC']	5,599✔
143	subfield_a = lcsh_term.chomp.split('ǂ').first.strip	5,599✔
144	subfield_a_aggregator << subfield_a	5,599✔
145	end
146	end
147	output = {}	1✔
148	output[:standalone_subfield_a] = subfield_a_aggregator.sort	1✔
149	output	1✔
150	end
151
152	# In order to re-write the fixture file based on a new CSV, run the rake task
153	# `bundle exec rake augment:recreate_fixtures`
154	def self.parse_required_subfields	1✔
155	output = {}	1✔
156	CSV.foreach(LCSH_TERMS_CSV_FILE, headers: true) do \|row\|	1✔
157	if row['With subdivisions ǂx etc.'] == 'y'	5,758✔
158	term = row['Term in MARC']	159✔
159	term_list = term.chomp.split(/ ǂ. /)	159✔
160	subfield_a = term_list.shift	159✔
161	if output[subfield_a]	159✔
162	output[subfield_a] << term_list	53✔
163	else
164	output[subfield_a] = [term_list]	106✔
165	end
166	end
167	end
168	output.to_json	1✔
169	end
170	end

pulibrary / bibdata / 2ee2c4fc-5ef0-4806-b86e-01bf70aa67a0

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous