• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pulibrary / bibdata / 3165b919-56f4-4faa-baf4-b5a08621230c

18 Sep 2025 03:08PM UTC coverage: 90.866% (+0.2%) from 90.675%
3165b919-56f4-4faa-baf4-b5a08621230c

Pull #2927

circleci

Ryan Laddusaw
Add QA DSpace config
Pull Request #2927: Index theses from dspace 7+

1411 of 1519 new or added lines in 16 files covered. (92.89%)

41 existing lines in 7 files now uncovered.

8874 of 9766 relevant lines covered (90.87%)

338.18 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

85.87
/lib/bibdata_rs/src/theses/dataspace/collection.rs
1
// This module is responsible for interacting with Dataspace collections
2
// using the Dataspace JSON API
3

4
use std::{
5
    fs::File,
6
    io::{BufWriter, Write},
7
};
8

9
use crate::solr::SolrDocument;
10
use crate::theses::{
11
    config,
12
    dataspace::{community, document::DataspaceDocument},
13
    temp_theses_cache_path,
14
};
15
use anyhow::{anyhow, Result};
16
use log::debug;
17
use magnus::exception;
18
use rayon::prelude::*;
19
use serde::Deserialize;
20

21
#[derive(Clone, Debug, Default, Deserialize)]
22
pub struct SearchResponse {
23
    pub _embedded: SearchEmbedded,
24
}
25

26
#[derive(Clone, Debug, Default, Deserialize)]
27
#[serde(rename_all =  "camelCase")]
28
pub struct SearchEmbedded {
29
    pub search_result: SearchResult,
30
}
31

32
#[derive(Clone, Debug, Default, Deserialize)]
33
pub struct SearchResult {
34
    pub _embedded: ResultEmbedded,
35
    pub page: Page,
36
}
37

38
#[derive(Clone, Debug, Default, Deserialize)]
39
pub struct ResultEmbedded {
40
    pub objects: Vec<Item>,
41
}
42

43
#[derive(Clone, Debug, Default, Deserialize)]
44
#[serde(rename_all =  "camelCase")]
45
pub struct Page {
46
    number: i32,
47
    total_pages: i32,
48
}
49

50
#[derive(Clone, Debug, Default, Deserialize)]
51
pub struct Item {
52
    pub _embedded: ItemEmbedded,
53
}
54

55
#[derive(Clone, Debug, Default, Deserialize)]
56
#[serde(rename_all =  "camelCase")]
57
pub struct ItemEmbedded {
58
    pub indexable_object: DataspaceDocument,
59
}
60

61
/// Builds the search URL for one page of objects within a scope.
///
/// * `server` - base URL of the API, without a trailing slash
/// * `scope` - identifier placed in the `scope` query parameter
/// * `page_size` - value for the `size` query parameter
/// * `page` - value for the `page` query parameter
///
/// The arguments are interpolated verbatim; no URL encoding is applied.
pub fn collection_url(server: &str, scope: &str, page_size: &str, page: &str) -> String {
    format!("{server}/discover/search/objects?scope={scope}&size={page_size}&page={page}")
}
7✔
67

68
fn magnus_err_from_serde_err(value: &serde_json::Error) -> magnus::Error {
×
69
    magnus::Error::new(exception::runtime_error(), value.to_string())
×
70
}
×
71

72
fn magnus_err_from_anyhow_err(value: &anyhow::Error) -> magnus::Error {
1✔
73
    magnus::Error::new(exception::runtime_error(), value.to_string())
1✔
74
}
1✔
75

76
// The main function for thesis caching, to be called from Ruby
77
pub fn collections_as_solr(
1✔
78
    server: String,
1✔
79
    community_handle: String,
1✔
80
    rest_limit: u32,
1✔
81
) -> Result<(), magnus::Error> {
1✔
82
    env_logger::init();
1✔
83
    let documents: Vec<DataspaceDocument> =
×
84
        get_document_list(&server, &community_handle, rest_limit, |server, handle| {
1✔
85
            community::get_collection_list(server, handle, community::get_community_id)
1✔
86
        })
1✔
87
        .map_err(|e| magnus_err_from_anyhow_err(&e))?;
1✔
NEW
88
    let file = File::create(temp_theses_cache_path())
×
89
        .map_err(|value| magnus_err_from_anyhow_err(&anyhow!(value)))?;
×
90
    let mut writer = BufWriter::new(file);
×
91
    serde_json::to_writer_pretty(
×
92
        &mut writer,
×
93
        &documents
×
94
            .iter()
×
95
            .map(SolrDocument::from)
×
96
            .collect::<Vec<SolrDocument>>(),
×
97
    )
98
    .map_err(|e| magnus_err_from_serde_err(&e))?;
×
99
    writer
×
100
        .flush()
×
101
        .map_err(|value| magnus_err_from_anyhow_err(&anyhow!(value)))?;
×
102
    Ok(())
×
103
}
1✔
104

105
type CollectionIdsSelector = fn(&str, &str) -> Result<Vec<String>>;
106
pub fn get_document_list(
3✔
107
    server: &str,
3✔
108
    community_id: &str,
3✔
109
    rest_limit: u32,
3✔
110
    ids_selector: CollectionIdsSelector, // a closure that returns a Vec of dspace collection ids
3✔
111
) -> Result<Vec<DataspaceDocument>> {
3✔
112
    let collection_ids = ids_selector(server, community_id)?;
3✔
113
    let documents = collection_ids
2✔
114
        .par_iter()
2✔
115
        .try_fold(Vec::new, |mut accumulator, collection_id| {
2✔
116
            get_documents_in_collection(
2✔
117
                &mut accumulator,
2✔
118
                server,
2✔
119
                collection_id.clone(),
2✔
120
                rest_limit,
2✔
121
                0,
122
                0,
123
            )?;
1✔
124
            Ok::<Vec<DataspaceDocument>, anyhow::Error>(accumulator)
1✔
125
        })
2✔
126
        .try_reduce(Vec::new, |mut a, b| {
2✔
127
            a.extend(b);
1✔
128
            Ok(a)
1✔
129
        })?;
1✔
130

131
    Ok(documents)
1✔
132
}
3✔
133

134
// This function recursively fetches paginated API results and retries on error
135
fn get_documents_in_collection(
6✔
136
    documents: &mut Vec<DataspaceDocument>,
6✔
137
    server: &str,
6✔
138
    scope: String,
6✔
139
    page_size: u32,
6✔
140
    page: u32,
6✔
141
    attempt: u8,
6✔
142
) -> Result<Vec<DataspaceDocument>> {
6✔
143
    let url = collection_url(
6✔
144
        server,
6✔
145
        &scope,
6✔
146
        &page_size.to_string(),
6✔
147
        &page.to_string(),
6✔
148
    );
149
    if attempt == 0 {
6✔
150
        debug!("Querying for the DSpace Collection at {}", url)
3✔
151
    } else {
152
        debug!(
3✔
153
            "Retrying query {}, attempt {} of {}",
×
154
            url,
155
            attempt,
156
            config::THESES_RETRY_ATTEMPTS
157
        );
158
    }
159
    let search_response = get_url_as_json(&url);
6✔
160
    let pagination: Page;
161

162
    let mut new_documents = match search_response {
6✔
163
        Ok(docs) => {
2✔
164
            pagination = docs.clone()._embedded.search_result.page;
2✔
165
            Ok(map_search_result_to_vec(docs))
2✔
166
        },
167
        Err(e) => {
4✔
168
            // If there was an error, increment the count of attempts and recurse
169
            pagination = Page {
4✔
170
                number: 0,
4✔
171
                total_pages: 0,
4✔
172
            };
4✔
173
            if attempt < config::THESES_RETRY_ATTEMPTS {
4✔
174
                get_documents_in_collection(
3✔
175
                    documents,
3✔
176
                    server,
3✔
177
                    scope.clone(),
3✔
178
                    page_size,
3✔
179
                    page,
3✔
180
                    attempt + 1,
3✔
181
                )
182
            } else {
183
                Err(e)
1✔
184
            }
185
        }
186
    }?;
4✔
187
    if !new_documents.is_empty() {
2✔
188
        documents.append(&mut new_documents);
1✔
189
    }
1✔
190
    // If the current page is not the last page get the next page of documents
191
    if pagination.number + 1 < pagination.total_pages {
2✔
UNCOV
192
        get_documents_in_collection(
×
UNCOV
193
            documents,
×
UNCOV
194
            server,
×
NEW
195
            scope,
×
NEW
196
            page_size,
×
NEW
197
            page + 1,
×
NEW
198
            0)?;
×
199
    }
2✔
200
    Ok(vec![])
2✔
201
}
6✔
202

203
fn get_url_as_json(url: &str) -> Result<SearchResponse> {
6✔
204
    reqwest::blocking::get(url)?
6✔
205
        .json()
6✔
206
        .map_err(|e| anyhow!("Could not parse json at {url}: {e:?}"))
6✔
207
}
6✔
208

209
/// Extracts the indexable documents from a search response, in the order the
/// objects appear in the response.
///
/// The response is consumed, so each document is moved out rather than cloned.
fn map_search_result_to_vec(search_response: SearchResponse) -> Vec<DataspaceDocument> {
    search_response
        ._embedded
        .search_result
        ._embedded
        .objects
        .into_iter()
        .map(|item| item._embedded.indexable_object)
        .collect()
}
2✔
212

213
#[cfg(test)]
mod tests {
    use rb_sys_test_helpers::ruby_test;

    use super::*;

    // The URL builder interpolates all four parts into the search endpoint.
    #[test]
    fn it_creates_a_collection_url() {
        let url = collection_url(
            "https://theses-dissertations.princeton.edu/server/api",
            "d98b1985-fc36-47ce-b11a-62386b505e85",
            "100",
            "10",
        );
        assert_eq!(
            url,
            "https://theses-dissertations.princeton.edu/server/api/discover/search/objects?scope=d98b1985-fc36-47ce-b11a-62386b505e85&size=100&page=10"
        );
    }

    // A single successful page is parsed into the expected documents.
    #[test]
    fn it_fetches_the_documents_from_the_community() {
        let mut server = mockito::Server::new();
        let mock_page0 = server
            .mock(
                "GET",
                "/discover/search/objects?scope=ace6dfbf-4f73-4558-acd0-1c4e5fd94baa&size=20&page=0",
            )
            .with_status(200)
            .with_body_from_file("../../spec/fixtures/files/theses/api_client_search.json")
            .create();

        // Stub the collection lookup so that only the search endpoint is exercised.
        let ids_selector: CollectionIdsSelector = |_server: &str, _handle: &str| {
            Ok(vec!["ace6dfbf-4f73-4558-acd0-1c4e5fd94baa".to_string()])
        };
        let docs = get_document_list(
            &server.url(),
            "c5839e02-b833-4db1-a92f-92a1ffd286b9",
            20,
            ids_selector,
        )
        .unwrap();

        assert_eq!(docs.len(), 20);
        assert_eq!(
            docs[0].title.clone().unwrap(),
            vec![Some(
                "Charging Ahead, Left Behind?\nBalancing Local Labor Market Trade-Offs of Recent U.S. Power Plant Retirements and Renewable Energy Expansion"
                    .to_owned()
            )]
        );

        mock_page0.assert();
    }

    // Requesting the page served by the page-1 fixture adds no documents.
    #[test]
    fn requests_past_pagination_limit_return_no_results() {
        let mut server = mockito::Server::new();
        let mock_page1 = server
            .mock(
                "GET",
                "/discover/search/objects?scope=ace6dfbf-4f73-4558-acd0-1c4e5fd94baa&size=20&page=1",
            )
            .with_status(200)
            .with_body_from_file("../../spec/fixtures/files/theses/api_client_search_page_1.json")
            .create();

        let mut docs: Vec<DataspaceDocument> = vec![];
        // The return value is intentionally discarded; results arrive through `docs`.
        let _ = get_documents_in_collection(
            &mut docs,
            &server.url(),
            "ace6dfbf-4f73-4558-acd0-1c4e5fd94baa".to_string(),
            20,
            1,
            0,
        );

        assert_eq!(docs.len(), 0);
        mock_page1.assert();
    }

    // A persistently failing endpoint is retried and the error is surfaced.
    #[test]
    fn it_retries_requests_when_500_errors() {
        let mut server = mockito::Server::new();
        let mock_failure = server
            .mock(
                "GET",
                "/discover/search/objects?scope=d98b1985-fc36-47ce-b11a-62386b505e85&size=100&page=0",
            )
            .with_status(500)
            .expect(4) // The initial request + 3 retries
            .create();

        let ids_selector: CollectionIdsSelector = |_server: &str, _handle: &str| {
            Ok(vec!["d98b1985-fc36-47ce-b11a-62386b505e85".to_string()])
        };
        let docs = get_document_list(&server.url(), "88435/dsp019c67wm88m", 100, ids_selector);
        assert!(docs.is_err());

        mock_failure.assert();
    }

    // Failures while listing the community are returned as an error to the caller.
    #[ruby_test]
    fn it_notifies_ruby_of_errors() {
        let mut server = mockito::Server::new();
        let mock_bad_response = server
            .mock("GET", "/core/communities/")
            .with_status(500)
            .create();

        let outcome = collections_as_solr(server.url(), "88435/dsp019c67wm88m".to_owned(), 100);
        assert!(outcome.is_err());
        mock_bad_response.assert();
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc