f04bc944-f9b4-4a42-8b26-dcacd0e3e688

Committed 11 Mar 2025 10:27PM UTC coverage: 34.017% (-58.1%) from 92.162%

Build # f04bc944-f9b4-4a42-8b26-dcacd0e3e688

Build Type

Pull #2653

circleci

Committed by

christinach

Commit Message

Add new lc_subject_facet field.
Helps with the vocabulary work https://github.com/pulibrary/orangelight/pull/3386
In this new field we index only the LC subject heading and its subdivisions,
so that when the user searches using the Details section, they can query Solr for
all the subject headings and their subdivisions.

This is needed for the Subject browse Vocabulary work.
example: "lc_subject_facet": [
             "Booksellers and bookselling—Italy—Directories",
             "Booksellers and bookselling-Italy",
             "Booksellers and bookselling"
              ]

Pull Request Pull Request #2653: Add new lc_subject_facet field.

Run Details

1 of 3 new or added lines in 1 file covered. (33.33%)

2215 existing lines in 93 files now uncovered.

1294 of 3804 relevant lines covered (34.02%)

0.99 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

26.83

/marc_to_solr/lib/augment_the_subject.rb

# frozen_string_literal: true

require 'set'

##
# The creation and management of metadata are not neutral activities.
#
# Decides whether a list of subject headings should additionally receive the
# "Indigenous Studies" heading, by comparing normalized headings against term
# lists read from files in the `augment_the_subject` directory next to this file.
#
# NOTE(review): SEPARATOR (the delimiter between the parts of a heading) is not
# defined in this class; it is expected to be supplied by the surrounding project.
class AugmentTheSubject
  LCSH_TERMS_CSV_FILE = File.join(File.dirname(__FILE__), 'augment_the_subject', 'indigenous_studies.csv')
  # Can be re-created using `bundle exec rake augment:recreate_fixtures`
  LCSH_STANDALONE_A_FILE = File.join(File.dirname(__FILE__), 'augment_the_subject', 'standalone_subfield_a.json')
  # Must be created by hand from file provided by metadata librarians
  LCSH_STANDALONE_X_FILE = File.join(File.dirname(__FILE__), 'augment_the_subject', 'standalone_subfield_x.json')
  # Can be re-created using `bundle exec rake augment:recreate_fixtures`
  LCSH_REQUIRED_SUBFIELDS = File.join(File.dirname(__FILE__), 'augment_the_subject', 'indigenous_studies_required.json')

  ##
  # Ensure the needed config files exist
  # @raise [RuntimeError] naming the first missing file
  def initialize
    {
      'csv' => LCSH_TERMS_CSV_FILE,
      'standalone subfield a' => LCSH_STANDALONE_A_FILE,
      'standalone subfield x' => LCSH_STANDALONE_X_FILE,
      'required subfields' => LCSH_REQUIRED_SUBFIELDS
    }.each do |label, path|
      raise "Cannot find lcsh #{label} file at #{path}" unless File.exist?(path)
    end
  end

  ##
  # Normalized terms from the `standalone_subfield_a` key of
  # LCSH_STANDALONE_A_FILE. Read once and memoized.
  # @return [<String>]
  def standalone_subfield_a_terms
    @standalone_subfield_a_terms ||= begin
      parsed_json = JSON.parse(File.read(LCSH_STANDALONE_A_FILE), symbolize_names: true)
      parsed_json[:standalone_subfield_a].map { |term| normalize(term) }
    end
  end

  ##
  # Normalized terms from the `standalone_subfield_x` key of
  # LCSH_STANDALONE_X_FILE. Read once and memoized.
  # @return [<String>]
  def standalone_subfield_x_terms
    @standalone_subfield_x_terms ||= begin
      parsed_json = JSON.parse(File.read(LCSH_STANDALONE_X_FILE), symbolize_names: true)
      parsed_json[:standalone_subfield_x].map { |term| normalize(term) }
    end
  end

  ##
  # Lookup table built from LCSH_REQUIRED_SUBFIELDS. Read once and memoized.
  # Keys are normalized first parts of a heading, as symbols; each value is a
  # list of Sets of normalized terms, one Set per required combination.
  # @return [Hash{Symbol => <Set<String>>}]
  def indigenous_studies_required
    @indigenous_studies_required ||= begin
      parsed_json = JSON.parse(File.read(LCSH_REQUIRED_SUBFIELDS))
      parsed_json.each_with_object({}) do |(subfield_a, combinations), lookup|
        # Sets allow the subset comparison done at match time.
        lookup[normalize(subfield_a).to_sym] = combinations.map do |combination|
          combination.map { |term| normalize(term) }.to_set
        end
      end
    end
  end

  ##
  # Normalize lcsh terms so they can match at index time.
  # 1. strip a trailing line break
  # 2. downcase
  # 3. replace ǂ terms with SEPARATOR
  # @param [String] lcsh_term
  # @return [String]
  def normalize(lcsh_term)
    lcsh_term.chomp.downcase.gsub(/ ǂ. /, SEPARATOR)
  end

  ##
  # Given an array of terms, add "Indigenous Studies" if any of the terms match.
  # The array passed in is modified in place and returned.
  # @param [<String>] terms
  # @return [<String>]
  def add_indigenous_studies(terms)
    terms << 'Indigenous Studies' if indigenous_studies?(terms)
    terms
  end

  ##
  # Given an array of terms, check whether this set of terms should have an
  # additional subject heading of "Indigenous Studies" added.
  # Blank terms are ignored.
  # @param [<String>] terms
  # @return [Boolean]
  def indigenous_studies?(terms)
    terms.any? do |term|
      next false if term.blank?

      subfield_a_match?(term) ||
        subfield_x_match?(term) ||
        subfield_a_with_required_subfields_match?(term)
    end
  end

  ##
  # For some subject terms, only the first part needs to match.
  # E.g., "Quinnipiac Indians-History", "Quinnipiac Indians-Culture" should both
  # be assigned an Indigenous Studies term even though that entire term doesn't
  # appear in our terms list.
  # @param [String] term
  # @return [Boolean] false when the term has no first part at all
  def subfield_a_match?(term)
    subfield_a = term.split(SEPARATOR).first
    # String#split yields [] for an empty or separator-only term.
    return false if subfield_a.nil?

    # A single trailing period is ignored when comparing.
    standalone_subfield_a_terms.include?(normalize(subfield_a).sub(/\.\z/, ''))
  end

  ##
  # For some subfield terms, only a single subfield needs to match.
  # E.g., any subject term that includes "Indian authors" should be assigned Indigenous Studies
  # @param [String] term
  # @return [Boolean]
  def subfield_x_match?(term)
    (standalone_subfield_x_terms & normalized_subfields(term)).any?
  end

  ##
  # Some subject terms require a combination of terms in order to be assigned Indigenous Studies.
  # For example, "Alaska-Antiquities" should be a match, but "Alaska" by itself should not,
  # nor should "Antiquities" by itself.
  # @param [String] term
  # @return [Boolean] false when the term has no first part at all
  def subfield_a_with_required_subfields_match?(term)
    subfield_a, *subdivisions = normalized_subfields(term)
    return false if subfield_a.nil?

    required_subfields = indigenous_studies_required[subfield_a.to_sym]
    return false unless required_subfields

    # Built once rather than once per required combination.
    present = subdivisions.to_set
    required_subfields.any? { |req_terms| req_terms.subset?(present) }
  end

  # In order to re-write the fixture file based on a new CSV, run the rake task
  # `bundle exec rake augment:recreate_fixtures`
  #
  # Collects the text before the first ǂ of every CSV row that is not flagged
  # 'y' in the 'With subdivisions ǂx etc.' column.
  # Rows without a 'Term in MARC' value are skipped.
  # @return [Hash] `{ standalone_subfield_a: <sorted unique terms> }`
  def self.parse_standalone_a
    subfield_a_aggregator = Set.new
    CSV.foreach(LCSH_TERMS_CSV_FILE, headers: true) do |row|
      next if row['With subdivisions ǂx etc.'] == 'y'

      subfield_a = row['Term in MARC'].to_s.chomp.split('ǂ').first
      next if subfield_a.nil?

      subfield_a_aggregator << subfield_a.strip
    end
    { standalone_subfield_a: subfield_a_aggregator.sort }
  end

  # In order to re-write the fixture file based on a new CSV, run the rake task
  # `bundle exec rake augment:recreate_fixtures`
  #
  # Groups every CSV row flagged 'y' in the 'With subdivisions ǂx etc.' column
  # by its first part, listing the remaining parts of each row under it.
  # Rows without a 'Term in MARC' value are skipped.
  # @return [String] JSON object mapping first part => list of lists of terms
  def self.parse_required_subfields
    output = {}
    CSV.foreach(LCSH_TERMS_CSV_FILE, headers: true) do |row|
      next unless row['With subdivisions ǂx etc.'] == 'y'

      term_list = row['Term in MARC'].to_s.chomp.split(/ ǂ. /)
      subfield_a = term_list.shift
      next if subfield_a.nil?

      (output[subfield_a] ||= []) << term_list
    end
    output.to_json
  end

  private

  # Splits a heading on SEPARATOR and normalizes each part.
  def normalized_subfields(term)
    term.split(SEPARATOR).map { |subfield| normalize(subfield) }
  end
end

1	# frozen_string_literal: true
2
3	require 'set'	1✔
4
5	##
6	# The creation and management of metadata are not neutral activities.
7	class AugmentTheSubject	1✔
8	LCSH_TERMS_CSV_FILE = File.join(File.dirname(__FILE__), 'augment_the_subject', 'indigenous_studies.csv')	1✔
9	# Can be re-created using `bundle exec rake augment:recreate_fixtures`
10	LCSH_STANDALONE_A_FILE = File.join(File.dirname(__FILE__), 'augment_the_subject', 'standalone_subfield_a.json')	1✔
11	# Must be created by hand from file provided by metadata librarians
12	LCSH_STANDALONE_X_FILE = File.join(File.dirname(__FILE__), 'augment_the_subject', 'standalone_subfield_x.json')	1✔
13	# Can be re-created using `bundle exec rake augment:recreate_fixtures`
14	LCSH_REQUIRED_SUBFIELDS = File.join(File.dirname(__FILE__), 'augment_the_subject', 'indigenous_studies_required.json')	1✔
15
16	##
17	# Ensure the needed config files exist
18	def initialize	1✔
19	raise "Cannot find lcsh csv file at #{LCSH_TERMS_CSV_FILE}" unless File.exist?(LCSH_TERMS_CSV_FILE)	1✔
20	unless File.exist?(LCSH_STANDALONE_A_FILE)	1✔
21	raise "Cannot find lcsh standalone subfield a file at #{LCSH_STANDALONE_A_FILE}"	×
22	end
23	unless File.exist?(LCSH_STANDALONE_X_FILE)	1✔
24	raise "Cannot find lcsh standalone subfield x file at #{LCSH_STANDALONE_X_FILE}"	×
25	end
26	unless File.exist?(LCSH_REQUIRED_SUBFIELDS)	1✔
27	raise "Cannot find lcsh required subfields file at #{LCSH_REQUIRED_SUBFIELDS}"	×
28	end
29	end
30
31	def standalone_subfield_a_terms	1✔
UNCOV 32	@standalone_subfield_a_terms \|\|= begin	×
UNCOV 33	parsed_json = JSON.parse(File.read(LCSH_STANDALONE_A_FILE), { symbolize_names: true })	×
UNCOV 34	parsed_json[:standalone_subfield_a].map do \|term\|	×
UNCOV 35	normalize(term)	×
36	end
37	end
38	end
39
40	def standalone_subfield_x_terms	1✔
UNCOV 41	@standalone_subfield_x_terms \|\|= begin	×
UNCOV 42	parsed_json = JSON.parse(File.read(LCSH_STANDALONE_X_FILE), { symbolize_names: true })	×
UNCOV 43	parsed_json[:standalone_subfield_x].map do \|term\|	×
UNCOV 44	normalize(term)	×
45	end
46	end
47	end
48
49	def indigenous_studies_required	1✔
UNCOV 50	@indigenous_studies_required \|\|= begin	×
UNCOV 51	parsed_json = JSON.parse(File.read(LCSH_REQUIRED_SUBFIELDS), { symbolize_names: false })	×
52	# Turns all the sub-arrays into sets for set comparison later
UNCOV 53	parsed_json.transform_values! do \|value\|	×
UNCOV 54	value.map do \|val\|	×
UNCOV 55	val.map { \|term\| normalize(term) }.to_set	×
56	end
57	end
58	# Normalizes and symbolizes key for fast and consistent retrieval
UNCOV 59	parsed_json.transform_keys! do \|key\|	×
UNCOV 60	normalize(key).to_sym	×
61	end
62	end
63	end
64
65	##
66	# Normalize lcsh terms so they can match at index time.
67	# 1. downcase
68	# 2. replace ǂ terms with SEPARATOR
69	def normalize(lcsh_term)	1✔
UNCOV 70	lcsh_term.chomp.downcase.gsub(/ ǂ. /, SEPARATOR)	×
71	end
72
73	##
74	# Given an array of terms, add "Indigenous Studies" if any of the terms match
75	# @param [<String>] terms
76	# @return [<String>]
77	def add_indigenous_studies(terms)	1✔
UNCOV 78	terms << 'Indigenous Studies' if indigenous_studies?(terms)	×
UNCOV 79	terms	×
80	end
81
82	##
83	# Given an array of terms, check whether this set of terms should have an
84	# additional subject heading of "Indigenous Studies" added
85	# @param [<String>] terms
86	# @return [Boolean]
87	def indigenous_studies?(terms)	1✔
UNCOV 88	terms.each do \|term\|	×
UNCOV 89	next if term.blank?	×
90
UNCOV 91	return true if subfield_a_match?(term)	×
UNCOV 92	return true if subfield_x_match?(term)	×
UNCOV 93	return true if subfield_a_with_required_subfields_match?(term)	×
94	end
UNCOV 95	false	×
96	end
97
98	##
99	# For some subject terms, only the first part needs to match.
100	# E.g., "Quinnipiac Indians-History", "Quinnipiac Indians-Culture" should both
101	# be assigned an Indigenous Studies term even though that entire term doesn't
102	# appear in our terms list.
103	def subfield_a_match?(term)	1✔
UNCOV 104	subfield_a = normalize(term.split(SEPARATOR).first).gsub(/\.$/, '')	×
UNCOV 105	standalone_subfield_a_terms.include?(subfield_a)	×
106	end
107
108	##
109	# For some subfield terms, only a single subfield needs to match.
110	# E.g., any subject term that includes "Indian authors" should be assigned Indigenous Studies
111	def subfield_x_match?(term)	1✔
UNCOV 112	subfields = term.split(SEPARATOR)	×
UNCOV 113	subfields = subfields.map { \|subfield\| normalize(subfield) }	×
UNCOV 114	!(standalone_subfield_x_terms & subfields).empty?	×
115	end
116
117	##
118	# Some subject terms require a combination of terms in order to be assigned Indigenous Studies.
119	# For example, "Alaska-Antiquities" should be a match, but "Alaska" by itself should not,
120	# nor should "Antiquities" by itself.
121	def subfield_a_with_required_subfields_match?(term)	1✔
UNCOV 122	subfields = term.split(SEPARATOR)	×
UNCOV 123	subfields = subfields.map { \|subfield\| normalize(subfield) }	×
UNCOV 124	subfield_a = subfields.shift.to_sym	×
125
UNCOV 126	required_subfields = indigenous_studies_required[subfield_a]	×
UNCOV 127	return false unless required_subfields	×
128
UNCOV 129	required_subfields.map do \|req_terms\|	×
UNCOV 130	return true if req_terms.subset?(subfields.to_set)	×
131	end
UNCOV 132	false	×
133	end
134
135	# In order to re-write the fixture file based on a new CSV, run the rake task
136	# `bundle exec rake augment:recreate_fixtures`
137	def self.parse_standalone_a	1✔
UNCOV 138	subfield_a_aggregator = Set.new	×
UNCOV 139	CSV.foreach(LCSH_TERMS_CSV_FILE, headers: true) do \|row\|	×
UNCOV 140	requires_subfield = row['With subdivisions ǂx etc.'] == 'y'	×
UNCOV 141	unless requires_subfield	×
UNCOV 142	lcsh_term = row['Term in MARC']	×
UNCOV 143	subfield_a = lcsh_term.chomp.split('ǂ').first.strip	×
UNCOV 144	subfield_a_aggregator << subfield_a	×
145	end
146	end
UNCOV 147	output = {}	×
UNCOV 148	output[:standalone_subfield_a] = subfield_a_aggregator.sort	×
UNCOV 149	output	×
150	end
151
152	# In order to re-write the fixture file based on a new CSV, run the rake task
153	# `bundle exec rake augment:recreate_fixtures`
154	def self.parse_required_subfields	1✔
UNCOV 155	output = {}	×
UNCOV 156	CSV.foreach(LCSH_TERMS_CSV_FILE, headers: true) do \|row\|	×
UNCOV 157	if row['With subdivisions ǂx etc.'] == 'y'	×
UNCOV 158	term = row['Term in MARC']	×
UNCOV 159	term_list = term.chomp.split(/ ǂ. /)	×
UNCOV 160	subfield_a = term_list.shift	×
UNCOV 161	if output[subfield_a]	×
UNCOV 162	output[subfield_a] << term_list	×
163	else
UNCOV 164	output[subfield_a] = [term_list]	×
165	end
166	end
167	end
UNCOV 168	output.to_json	×
169	end
170	end

pulibrary / bibdata / f04bc944-f9b4-4a42-8b26-dcacd0e3e688

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous