• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / pdc_discovery / 787041f3-bb1b-461b-9eaa-c391c9c1e54a

19 Feb 2025 03:50PM UTC coverage: 96.483% (-0.003%) from 96.486%
787041f3-bb1b-461b-9eaa-c391c9c1e54a

push

circleci

web-flow
Bump nokogiri from 1.16.8 to 1.18.3 (#756)

Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.16.8 to 1.18.3.
- [Release notes](https://github.com/sparklemotion/nokogiri/releases)
- [Changelog](https://github.com/sparklemotion/nokogiri/blob/v1.18.3/CHANGELOG.md)
- [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.16.8...v1.18.3)

---
updated-dependencies:
- dependency-name: nokogiri
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

3649 of 3782 relevant lines covered (96.48%)

180.75 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.56
/spec/lib/describe_indexer_spec.rb
1
# frozen_string_literal: true
2
require "rails_helper"
1✔
3

4
# rubocop:disable Metrics/BlockLength
5
RSpec.describe DescribeIndexer do
1✔
6
  describe 'indexing a single record' do
1✔
7
    let(:single_item) { file_fixture("bitklavier_binaural.json").read }
21✔
8
    let(:indexer) do
1✔
9
      described_class.new(rss_url: "file://whatever.rss")
20✔
10
    end
11
    let(:indexed_record) do
1✔
12
      Blacklight.default_index.connection.delete_by_query("*:*")
20✔
13
      Blacklight.default_index.connection.commit
20✔
14
      indexer.index_one(single_item)
20✔
15
      response = Blacklight.default_index.connection.get 'select', params: { q: '*:*' }
20✔
16
      response["response"]["docs"].first
20✔
17
    end
18

19
    context "basic fields" do
1✔
20
      ##
21
      # The id is based on the DOI
22
      # A DOI of 10.34770/r75s-9j74 will become doi-10-34770-r75s-9j74
23
      it "id" do
1✔
24
        expect(indexed_record["id"]).to eq "doi-10-34770-r75s-9j74"
1✔
25
      end
26

27
      it "stores a copy of the full JSON in CDATA" do
1✔
28
        stored_json = indexed_record["pdc_describe_json_ss"]
1✔
29
        parsed_json = JSON.parse(stored_json)
1✔
30
        expect(parsed_json["resource"]["titles"][0]["title"]).to eq "bitKlavier Grand Sample Library—Binaural Mic Image"
1✔
31
      end
32

33
      it "author" do
1✔
34
        expect(indexed_record["author_tesim"]).to eq ['Trueman, Daniel']
1✔
35
      end
36

37
      it "description" do
1✔
38
        description = "The bitKlavier Grand consists"
1✔
39
        expect(indexed_record["description_tsim"].first.include?(description)).to be true
1✔
40
      end
41

42
      it "contributors" do
1✔
43
        expect(indexed_record["contributor_tsim"].include?("Villalta, Andrés")).to eq true
1✔
44
        expect(indexed_record["contributor_tsim"].include?("Chou, Katie")).to eq true
1✔
45
        expect(indexed_record["contributor_tsim"].include?("Ayres, Christien")).to eq true
1✔
46
        expect(indexed_record["contributor_tsim"].include?("Wang, Matthew")).to eq true
1✔
47
      end
48

49
      it "title" do
1✔
50
        # title includes all titles
51
        main_title = "bitKlavier Grand Sample Library—Binaural Mic Image"
1✔
52
        alt_title = "alter title for bitKlavier"
1✔
53
        expect(indexed_record["title_tesim"].include?(main_title)).to eq true
1✔
54
        expect(indexed_record["title_tesim"].include?(alt_title)).to eq true
1✔
55
        # alt title does not include the main title
56
        expect(indexed_record["alternative_title_tesim"].include?(main_title)).to eq false
1✔
57
        expect(indexed_record["alternative_title_tesim"].include?(alt_title)).to eq true
1✔
58
      end
59

60
      it "rights" do
1✔
61
        expect(indexed_record["rights_name_ssi"]).to eq "GNU General Public License"
1✔
62
        expect(indexed_record["rights_uri_ssi"]).to eq "https://www.gnu.org/licenses/gpl-3.0.en.html"
1✔
63
      end
64

65
      it "keywords" do
1✔
66
        expect(indexed_record["subject_all_ssim"].include?("keyword1")).to eq true
1✔
67
        expect(indexed_record["subject_all_ssim"].include?("keyword2")).to eq true
1✔
68
        expect(indexed_record["subject_all_ssim"].include?("keyword3")).to eq true
1✔
69
      end
70

71
      it "collection tag" do
1✔
72
        expect(indexed_record["collection_tag_ssim"].include?("Humanities")).to eq true
1✔
73
        expect(indexed_record["collection_tag_ssim"].include?("Something else")).to eq true
1✔
74
      end
75

76
      it "community" do
1✔
77
        expect(indexed_record["community_name_ssi"]).to eq "Research Data"
1✔
78
      end
79

80
      it "genre_ssim" do
1✔
81
        expect(indexed_record["genre_ssim"].first).to eq "Dataset"
1✔
82
      end
83

84
      it "issue_date_ssim" do
1✔
85
        expect(indexed_record["issue_date_ssim"].first).to eq "2021"
1✔
86
      end
87

88
      # The pdc_created_at_dtsi field is used to describe when a record was created in PDC
89
      # This is used to sort the Recently Published page
90
      it "pdc_created_at_dtsi" do
1✔
91
        expect(indexed_record["pdc_created_at_dtsi"]).to eq "2021-12-31T19:00:00Z"
1✔
92
        # example timestamp 1997-12-31T23:59:59Z
93
      end
94

95
      it "pdc_updated_at_dtsi" do
1✔
96
        expect(indexed_record["pdc_updated_at_dtsi"]).to eq "2021-12-31T20:00:00Z"
1✔
97
      end
98

99
      it "issue_date_strict_ssi" do
1✔
100
        expect(indexed_record["issue_date_strict_ssi"]).to eq "2021-01-01"
1✔
101
      end
102

103
      it "publisher_ssim" do
1✔
104
        expect(indexed_record["publisher_ssim"].first).to eq "Princeton University"
1✔
105
      end
106

107
      it "migrated_bsi" do
1✔
108
        expect(indexed_record["migrated_bsi"]).to eq true
1✔
109
      end
110

111
      xit 'referenced_by' do
1✔
112
        # Read from indexed_record like the sibling examples; `result` is not
        # defined in the visible example group and would raise NameError if
        # this pending example were enabled.
        expect(indexed_record['referenced_by_ssim'].first).to eq 'https://arxiv.org/abs/1903.06605'
×
113
      end
114
    end
115

116
    context "uris" do
1✔
117
      it "stores full URL for ARK and DOI" do
1✔
118
        expect(indexed_record["uri_ssim"].include?("http://arks.princeton.edu/ark:/88435/dsp015999n653h")).to eq true
1✔
119
        expect(indexed_record["uri_ssim"].include?("https://doi.org/10.34770/r75s-9j74")).to eq true
1✔
120
      end
121
    end
122

123
    context "files" do
1✔
124
      it "stores file detailed information" do
1✔
125
        files = JSON.parse(indexed_record['pdc_describe_json_ss'])['files']
1✔
126
        file1 = files.find { |file| file["filename"] == "10.80021/3m1k-6036/122/file1.jpg" }
2✔
127
        file2 = files.find { |file| file["filename"] == "10.80021/3m1k-6036/122/file2.txt" }
3✔
128
        file3 = files.find { |file| file["filename"] == "10.80021/3m1k-6036/122/folder-a/file3.txt" }
5✔
129
        expect(file1["size"]).to eq 316_781
1✔
130
        expect(file1["url"]).to eq "https://g-5beea4.90d4e.bd7c.data.globus.org/pdc-describe-staging-postcuration/10.80021/3m1k-6036/122/file1.jpg"
1✔
131
        expect(file2["size"]).to eq 396_003
1✔
132
        expect(file3["url"]).to eq "https://g-5beea4.90d4e.bd7c.data.globus.org/pdc-describe-staging-postcuration/10.80021/3m1k-6036/122/folder-a/file3.txt"
1✔
133
      end
134
    end
135

136
    context "all text catch all field" do
1✔
137
      it "indexes the file name in the all text catch all field" do
1✔
138
        files = JSON.parse(indexed_record['pdc_describe_json_ss'])['files']
1✔
139
        file_name = File.basename(files.first["filename"])
1✔
140
        response = Blacklight.default_index.connection.get 'select', params: { q: file_name }
1✔
141
        expect(response["response"]["numFound"]).to eq 1
1✔
142

143
        response = Blacklight.default_index.connection.get 'select', params: { q: "non-existing-value" }
1✔
144
        expect(response["response"]["numFound"]).to eq 0
1✔
145
      end
146
    end
147
  end
148

149
  describe 'indexing an RSS feed from PDC Describe' do
1✔
150
    let(:rss_feed) { file_fixture("works.rss").read }
4✔
151
    let(:resource1) { file_fixture("bitklavier_binaural.json").read }
9✔
152
    let(:resource2) { file_fixture("sowing_the_seeds.json").read }
9✔
153
    let(:rss_url_string) { "https://pdc-describe-prod.princeton.edu/describe/works.rss" }
5✔
154
    let(:indexer) { described_class.new(rss_url: rss_url_string) }
5✔
155

156
    it "has a traject indexer" do
1✔
157
      expect(indexer.traject_indexer).to be_instance_of Traject::Indexer::NokogiriIndexer
1✔
158
    end
159

160
    context 'indexing to solr' do
1✔
161
      before do
1✔
162
        Blacklight.default_index.connection.delete_by_query("*:*")
8✔
163
        Blacklight.default_index.connection.commit
8✔
164
        stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works.rss")
8✔
165
          .to_return(status: 200, body: rss_feed, headers: {})
166
        stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/6.json")
8✔
167
          .to_return(status: 200, body: resource1, headers: {})
168
        stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/20.json")
8✔
169
          .to_return(status: 200, body: resource2, headers: {})
170
      end
171

172
      it "sends items to solr" do
1✔
173
        response = Blacklight.default_index.connection.get 'select', params: { q: '*:*' }
1✔
174
        expect(response["response"]["numFound"]).to eq 0
1✔
175

176
        # If index_pdc_describe == false, do not index PDC Describe.
177
        # This is a safety measure so we don't index in production until we're ready
178
        # See config/pdc_discovery.yml to change this setting for real
179
        Rails.configuration.pdc_discovery.index_pdc_describe = false
1✔
180
        indexer.index
1✔
181
        response = Blacklight.default_index.connection.get 'select', params: { q: '*:*' }
1✔
182
        expect(response["response"]["numFound"]).to eq 0
1✔
183

184
        # If index_pdc_describe == true, DO index PDC Describe.
185
        Rails.configuration.pdc_discovery.index_pdc_describe = true
1✔
186
        indexer.index
1✔
187
        response = Blacklight.default_index.connection.get 'select', params: { q: '*:*' }
1✔
188
        expect(response["response"]["numFound"]).to eq 2
1✔
189
      end
190

191
      it "can sort by issue_date_strict_ssi" do
1✔
192
        Rails.configuration.pdc_discovery.index_pdc_describe = true
1✔
193
        indexer.index
1✔
194
        response = Blacklight.default_index.connection.get 'select', params: { q: '*:*', sort: 'issue_date_strict_ssi desc' }
1✔
195
        expect(response["response"]["numFound"]).to eq 2
1✔
196
        expect(response["response"]['docs'].first['pdc_created_at_dtsi']).to eq("2023-07-11T11:06:10Z")
1✔
197
        expect(response["response"]['docs'].last['pdc_created_at_dtsi']).to eq("2021-12-31T19:00:00Z")
1✔
198
        expect(response["response"]['docs'].first["issue_date_strict_ssi"]).to eq "2023-01-01"
1✔
199
        expect(response["response"]['docs'].last["issue_date_strict_ssi"]).to eq "2021-01-01"
1✔
200
      end
201

202
      context "works with multiple creators" do
1✔
203
        let(:pppl1) { File.read(File.join(fixture_paths.first, 'files', 'pppl1.json')) }
2✔
204
        let(:pppl2) { File.read(File.join(fixture_paths.first, 'files', 'pppl2.json')) }
2✔
205
        before do
1✔
206
          stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/6.json")
1✔
207
            .to_return(status: 200, body: pppl1, headers: {})
208
          stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/20.json")
1✔
209
            .to_return(status: 200, body: pppl2, headers: {})
210
        end
211

212
        it "can sort by the first author" do
1✔
213
          Rails.configuration.pdc_discovery.index_pdc_describe = true
1✔
214
          indexer.index
1✔
215
          response = Blacklight.default_index.connection.get 'select', params: { q: '*:*', sort: 'author_si desc' }
1✔
216
          expect(response["response"]["numFound"]).to eq 2
1✔
217
          expect(response["response"]['docs'].first['author_tesim'].first).to eq("Wang, Yin")
1✔
218
          expect(response["response"]['docs'].last['author_tesim'].first).to eq("Schwartz, Jacob A.")
1✔
219
          expect(response["response"]['docs'].first["issue_date_strict_ssi"]).to eq "2021-12-30"
1✔
220
          expect(response["response"]['docs'].last["issue_date_strict_ssi"]).to eq "2022-01-01"
1✔
221
        end
222
      end
223

224
      context "when there are items which are under active embargo" do
1✔
225
        let(:item_file_fixture) { file_fixture("pdc_describe_active_embargo.json") }
3✔
226
        let(:embargo_resource) { item_file_fixture.read }
3✔
227
        # This redundancy is required for consistent testing
228
        let(:rss_feed) { file_fixture("works.rss").read }
3✔
229
        let(:rss_url) { "https://pdc-describe-prod.princeton.edu/describe/works.rss" }
3✔
230
        let(:indexer) do
1✔
231
          described_class.new(rss_url: rss_url)
2✔
232
        end
233
        let(:solr_response) do
1✔
234
          Blacklight.default_index.connection.get 'select', params: { q: '*:*' }
2✔
235
        end
236
        let(:response) { solr_response["response"] }
3✔
237
        let(:num_found) { response["numFound"] }
3✔
238
        let(:docs) { response["docs"] }
3✔
239
        let(:doc) { docs.first }
3✔
240
        let(:files) { JSON.parse(doc['pdc_describe_json_ss'])['files'] }
2✔
241

242
        before do
1✔
243
          stub_request(:get, rss_url)
2✔
244
            .to_return(status: 200, body: rss_feed, headers: {})
245
          stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/6.json")
2✔
246
            .to_return(status: 200, body: embargo_resource, headers: {})
247
          stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/20.json")
2✔
248
            .to_return(status: 200, body: embargo_resource, headers: {})
249

250
          Rails.configuration.pdc_discovery.index_pdc_describe = true
2✔
251
          indexer.index
2✔
252
        end
253

254
        it "indexes the embargo date" do
1✔
255
          expect(solr_response).to include("response")
1✔
256
          expect(response).to include("numFound")
1✔
257
          expect(num_found).to eq 1
1✔
258
          expect(docs).not_to be_empty
1✔
259
          expect(doc).to include("embargo_date_dtsi")
1✔
260
        end
261

262
        it "does not index the files" do
1✔
263
          expect(solr_response).to include("response")
1✔
264
          expect(response).to include("numFound")
1✔
265
          expect(num_found).to eq 1
1✔
266
          expect(docs).not_to be_empty
1✔
267
          expect(doc).to include("pdc_describe_json_ss")
1✔
268
          expect(files).to be_empty
1✔
269
        end
270
      end
271

272
      context "when there are items which are under an expired embargo" do
1✔
273
        let(:item_file_fixture) { file_fixture("pdc_describe_expired_embargo.json") }
2✔
274
        let(:embargo_resource) { item_file_fixture.read }
2✔
275
        # This redundancy is required for consistent testing
276
        let(:rss_feed) { file_fixture("works.rss").read }
2✔
277
        let(:rss_url) { "https://pdc-describe-prod.princeton.edu/describe/works.rss" }
2✔
278
        let(:indexer) do
1✔
279
          described_class.new(rss_url: rss_url)
1✔
280
        end
281
        let(:solr_response) do
1✔
282
          Blacklight.default_index.connection.get 'select', params: { q: '*:*' }
1✔
283
        end
284
        let(:response) { solr_response["response"] }
2✔
285
        let(:num_found) { response["numFound"] }
2✔
286
        let(:docs) { response["docs"] }
2✔
287
        let(:doc) { docs.first }
2✔
288
        # pdc_describe_json_ss holds a JSON string, so it must be parsed before
        # reading "files". Indexing the raw string with ['files'] is only a
        # substring lookup and would make the emptiness check meaningless.
        let(:files) { JSON.parse(doc['pdc_describe_json_ss'])['files'] }
2✔
289

290
        before do
1✔
291
          stub_request(:get, rss_url)
1✔
292
            .to_return(status: 200, body: rss_feed, headers: {})
293
          stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/6.json")
1✔
294
            .to_return(status: 200, body: embargo_resource, headers: {})
295
          stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/20.json")
1✔
296
            .to_return(status: 200, body: embargo_resource, headers: {})
297

298
          Rails.configuration.pdc_discovery.index_pdc_describe = true
1✔
299
          indexer.index
1✔
300
        end
301

302
        it "does indexes the files" do
1✔
303
          expect(solr_response).to include("response")
1✔
304
          expect(response).to include("numFound")
1✔
305
          expect(num_found).to eq 1
1✔
306
          expect(docs).not_to be_empty
1✔
307
          expect(doc).to include("pdc_describe_json_ss")
1✔
308
          expect(files).not_to be_empty
1✔
309
        end
310
      end
311

312
      context "when there work is not under embargo" do
1✔
313
        let(:item_file_fixture) { file_fixture("pdc_describe_no_embargo.json") }
3✔
314
        let(:embargo_resource) { item_file_fixture.read }
3✔
315
        # This redundancy is required for consistent testing
316
        let(:rss_feed) { file_fixture("works.rss").read }
3✔
317
        let(:rss_url) { "https://pdc-describe-prod.princeton.edu/describe/works.rss" }
3✔
318
        let(:indexer) do
1✔
319
          described_class.new(rss_url: rss_url)
2✔
320
        end
321
        let(:solr_response) do
1✔
322
          Blacklight.default_index.connection.get 'select', params: { q: '*:*' }
2✔
323
        end
324
        let(:response) { solr_response["response"] }
3✔
325
        let(:num_found) { response["numFound"] }
3✔
326
        let(:docs) { response["docs"] }
3✔
327
        let(:doc) { docs.first }
3✔
328
        # pdc_describe_json_ss holds a JSON string, so it must be parsed before
        # reading "files". Indexing the raw string with ['files'] is only a
        # substring lookup and would make the emptiness check meaningless.
        let(:files) { JSON.parse(doc['pdc_describe_json_ss'])['files'] }
2✔
329

330
        before do
1✔
331
          stub_request(:get, rss_url)
2✔
332
            .to_return(status: 200, body: rss_feed, headers: {})
333
          stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/6.json")
2✔
334
            .to_return(status: 200, body: embargo_resource, headers: {})
335
          stub_request(:get, "https://pdc-describe-prod.princeton.edu/describe/works/20.json")
2✔
336
            .to_return(status: 200, body: embargo_resource, headers: {})
337

338
          Rails.configuration.pdc_discovery.index_pdc_describe = true
2✔
339
          indexer.index
2✔
340
        end
341

342
        it "does not index an embargo date" do
1✔
343
          expect(solr_response).to include("response")
1✔
344
          expect(response).to include("numFound")
1✔
345
          expect(num_found).to eq 1
1✔
346
          expect(docs).not_to be_empty
1✔
347
          expect(doc).not_to include("embargo_date_dtsi")
1✔
348
        end
349

350
        it "does index the files" do
1✔
351
          expect(solr_response).to include("response")
1✔
352
          expect(response).to include("numFound")
1✔
353
          expect(num_found).to eq 1
1✔
354
          expect(docs).not_to be_empty
1✔
355
          expect(doc).to include("pdc_describe_json_ss")
1✔
356
          expect(files).not_to be_empty
1✔
357
        end
358
      end
359
    end
360
  end
361
end
362
# rubocop:enable Metrics/BlockLength
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc