pulibrary / pymarc_dedupe
Build 4f06df0c-5da3-4707-bade-63cea0efc8d5 · circleci

19 May 2025 03:37PM UTC · coverage: 99.595% (-0.4% from 100.0%)

Pull Request #24: Green locally - connect to Postgres DB
maxkadel · Try to increase test coverage

158 of 161 new or added lines in 5 files covered (98.14%)
737 of 740 relevant lines covered (99.59%)
1.0 hits per line

Source file: /src/db_dedupe_records.py (97.14% covered)

from os import listdir
from os.path import isfile, join

import dedupe
import psycopg2
import psycopg2.extras

from config import settings
from src.marc_to_db import MarcToDb
from src.machine_learning_model import MachineLearningModel
from src.readable import Readable

RECORD_SELECT = """SELECT
                    id, title, author, publication_year, pagination, edition, publisher_name, type_of, is_electronic_resource
                    FROM records;
                """
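
The connection parameters used below come from a `config.settings` object that is not part of this file. A minimal stand-in, assuming `settings` only needs the four attributes the `psycopg2.connect` calls read:

# Hypothetical stand-in for config.settings; the real project presumably
# loads these from a settings file or the environment.
import os
from types import SimpleNamespace

settings = SimpleNamespace(
    db_name=os.environ.get("DB_NAME", "pymarc_dedupe"),
    db_user=os.environ.get("DB_USER", "postgres"),
    db_host=os.environ.get("DB_HOST", "localhost"),
    db_port=os.environ.get("DB_PORT", "5432"),
)
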
class DbDedupeRecords(MachineLearningModel):
    def __init__(self, input_directory, output_directory, match_threshold=0.5):
        super().__init__(output_directory, match_threshold)
        # Load every MARC file in the input directory into the records table
        for path in listdir(input_directory):
            full_path = join(input_directory, path)
            if isfile(full_path):
                MarcToDb(full_path).to_db()
        # Read connection returns rows as dicts, the shape dedupe works with
        self.read_con = psycopg2.connect(
            database=settings.db_name,
            user=settings.db_user,
            host=settings.db_host,
            port=settings.db_port,
            cursor_factory=psycopg2.extras.RealDictCursor,
        )
        # Write connection keeps the default tuple cursor for COPY and DDL
        self.write_con = psycopg2.connect(
            database=settings.db_name,
            user=settings.db_user,
            host=settings.db_host,
            port=settings.db_port,
        )
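
The two connections differ deliberately: the read connection hands rows back as dictionaries, while the write connection keeps plain tuples. A quick illustration of what `RealDictCursor` changes, assuming a reachable local Postgres database named "pymarc_dedupe" (the name is a placeholder):

# Sketch: same query, two cursor factories.
import psycopg2
import psycopg2.extras

plain = psycopg2.connect(database="pymarc_dedupe")
dicts = psycopg2.connect(
    database="pymarc_dedupe", cursor_factory=psycopg2.extras.RealDictCursor
)

with plain.cursor() as cur:
    cur.execute("SELECT 1 AS id, 'A Title' AS title")
    print(cur.fetchone())  # positional tuple: (1, 'A Title')

with dicts.cursor() as cur:
    cur.execute("SELECT 1 AS id, 'A Title' AS title")
    row = cur.fetchone()
    print(row["title"])  # dict-style access, which dedupe's record dicts rely on
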
    def deduper(self):
        try:
            # These two lines are the report's uncovered lines in this method:
            # the tests never run against a previously saved settings file.
            with open(self.settings_file_path, "rb") as sf:
                print("reading from", self.settings_file_path)
                model = dedupe.StaticDedupe(sf)
        except FileNotFoundError:
            model = dedupe.Dedupe(self.fields())
            model = self.train_and_write_model(model)

        return model
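
`self.fields()` is defined on `MachineLearningModel`, which this report does not include. Given the columns `RECORD_SELECT` pulls, a plausible shape for the variable definitions passed to `dedupe.Dedupe` is the dict style accepted by dedupe 2.x (newer releases use variable objects); the field choices here are illustrative, not the project's actual configuration:

# Illustrative only: the actual fields() lives in MachineLearningModel.
# The keys mirror the RECORD_SELECT columns dedupe is asked to compare.
fields = [
    {"field": "title", "type": "String"},
    {"field": "author", "type": "String", "has missing": True},
    {"field": "publication_year", "type": "Exact", "has missing": True},
    {"field": "publisher_name", "type": "String", "has missing": True},
    {"field": "pagination", "type": "String", "has missing": True},
]
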
    def prepare_training(self, model):
        with self.read_con.cursor("donor_select") as cur:
            cur.execute(RECORD_SELECT)
            temp_d = dict(enumerate(cur))
        try:
            with open(self.training_file_path, encoding="utf-8") as tf:
                print(
                    f"Loading training data from {self.training_file_path} - "
                    "you can skip console labeling if you would like"
                )
                model.prepare_training(temp_d, training_file=tf)
        except FileNotFoundError:
            model.prepare_training(temp_d)
        # Free the in-memory copy of the records. In the original, this `del`
        # sat after two `return` statements and could never run (the report
        # marks it uncovered); prepare_training mutates the model in place,
        # so nothing needs to be returned.
        del temp_d
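
`dict(enumerate(cur))` streams the whole result set into memory as the {key: record} mapping that `prepare_training` samples from; the integer keys are arbitrary. In miniature:

# What dict(enumerate(...)) produces from a stream of row dicts
rows = iter(
    [
        {"id": "a1", "title": "Moby Dick"},
        {"id": "a2", "title": "Moby-Dick"},
    ]
)
temp_d = dict(enumerate(rows))
print(temp_d[0]["title"], temp_d[1]["title"])  # Moby Dick Moby-Dick
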
    def train_and_write_model(self, model):
        self.prepare_training(model)
        self.console_label(model)
        model.train()
        # When finished, save our training away to disk
        self.write_training(model)
        self.write_settings(model)
        # Remove memory-intensive objects used for training
        model.cleanup_training()
        return model
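
`console_label`, `write_training`, and `write_settings` are inherited from `MachineLearningModel` and not shown in this report. In dedupe's standard workflow they would be thin wrappers over the library calls below; the file paths are placeholders, not the project's actual code:

# Plausible shape of the inherited helpers
import dedupe

def console_label(model):
    dedupe.console_label(model)  # interactive duplicate/not-duplicate prompts

def write_training(model, path="training.json"):
    with open(path, "w", encoding="utf-8") as tf:
        model.write_training(tf)  # labeled pairs, as JSON

def write_settings(model, path="learned.settings"):
    with open(path, "wb") as sf:
        model.write_settings(sf)  # binary blob StaticDedupe can reload
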
    def block(self, model):
        print("blocking...")
        print("creating blocking_map table")
        with self.write_con:
            with self.write_con.cursor() as cur:
                cur.execute("DROP TABLE IF EXISTS blocking_map")
                cur.execute("CREATE TABLE blocking_map (block_key TEXT, id TEXT)")
        print("creating inverted index")
        for field in model.fingerprinter.index_fields:
            with self.read_con.cursor("field_values") as cur:
                cur.execute(f"SELECT DISTINCT {field} FROM records")
                field_data = (row[field] for row in cur)
                model.fingerprinter.index(field_data, field)

        print("writing blocking map")
        with self.read_con.cursor("donor_select") as read_cur:
            read_cur.execute(RECORD_SELECT)

            full_data = ((row["id"], row) for row in read_cur)
            b_data = model.fingerprinter(full_data)

            with self.write_con:
                with self.write_con.cursor() as write_cur:
                    write_cur.copy_expert(
                        "COPY blocking_map FROM STDIN WITH CSV",
                        Readable(b_data),
                        size=10000,
                    )

        model.fingerprinter.reset_indices()
        print("indexing block_key")
        with self.write_con:
            with self.write_con.cursor() as cur:
                cur.execute(
                    "CREATE UNIQUE INDEX ON blocking_map "
                    "(block_key text_pattern_ops, id)"
                )
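
`copy_expert` streams its input from a file-like object, but the fingerprinter yields (block_key, id) tuples; `src.readable.Readable` has to bridge the two. That class isn't in this report, but dedupe's Postgres example ships essentially this adapter, which is presumably what the import provides:

# Sketch of a Readable adapter: turns an iterator of tuples into the
# file-like object copy_expert reads CSV text from.
import csv
import io
import itertools

class Readable:
    def __init__(self, iterator):
        self.output = io.StringIO()
        self.writer = csv.writer(self.output)
        self.iterator = iterator

    def read(self, size):
        # Encode the next `size` rows as CSV; an empty string signals
        # end-of-stream to copy_expert.
        self.writer.writerows(itertools.islice(self.iterator, size))
        chunk = self.output.getvalue()
        self.output.seek(0)
        self.output.truncate(0)
        return chunk
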
    def cluster(self, model):
        with self.write_con:
            with self.write_con.cursor() as cur:
                cur.execute("DROP TABLE IF EXISTS entity_map")

                print("creating entity_map table")
                cur.execute(
                    "CREATE TABLE entity_map "
                    "(id TEXT, canon_id TEXT, "
                    " cluster_score FLOAT, PRIMARY KEY(id))"
                )
        with open("pairs.sql", "r", encoding="utf-8") as file:
            pairs_sql = file.read()
        # A plain tuple cursor here: each row is unpacked positionally below
        with self.read_con.cursor(
            "pairs", cursor_factory=psycopg2.extensions.cursor
        ) as read_cur:
            read_cur.execute(pairs_sql)
            print("clustering...")
            clustered_dupes = model.cluster(
                model.score(self.record_pairs(read_cur)), threshold=self.match_threshold
            )
            print("writing results to database")
            with self.write_con:
                with self.write_con.cursor() as write_cur:
                    write_cur.copy_expert(
                        "COPY entity_map FROM STDIN WITH CSV",
                        Readable(cluster_ids(clustered_dupes)),
                        size=10000,
                    )
        with self.write_con:
            with self.write_con.cursor() as cur:
                cur.execute("CREATE INDEX head_index ON entity_map (canon_id)")
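
pairs.sql is read from disk and never shown in this report. Modeled on dedupe's Postgres example, a plausible query pairs every two records that share a block_key and returns each record as JSON, matching the four-column rows `record_pairs()` unpacks below. This is a reconstruction, not the project's actual file:

# Hypothetical pairs.sql contents; a real version would also need to
# deduplicate pairs that co-occur in more than one block.
PAIRS_SQL = """
SELECT a.id, row_to_json(a), b.id, row_to_json(b)
FROM blocking_map bm_a
JOIN blocking_map bm_b
  ON bm_a.block_key = bm_b.block_key AND bm_a.id < bm_b.id
JOIN records a ON a.id = bm_a.id
JOIN records b ON b.id = bm_b.id
"""

psycopg2 parses json columns into Python dicts on the way out, so each row arrives as (id, dict, id, dict), which is exactly what the scoring step needs.
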
    def record_pairs(self, result_set):
        for i, row in enumerate(result_set):
            a_record_id, a_record, b_record_id, b_record = row
            record_a = (a_record_id, a_record)
            record_b = (b_record_id, b_record)

            yield record_a, record_b

            # Coarse progress indicator for long scoring passes
            if i % 10000 == 0:
                print(i)
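
`record_pairs` only reshapes each flat four-column row into the ((id, record), (id, record)) pairs that `model.score` consumes. The reshaping, standalone:

# One flat row from the pairs query, reshaped the way record_pairs yields it
row = ("a1", {"title": "Moby Dick"}, "a2", {"title": "Moby-Dick"})
a_record_id, a_record, b_record_id, b_record = row
pair = ((a_record_id, a_record), (b_record_id, b_record))
print(pair[0][0], pair[1][0])  # a1 a2
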
def cluster_ids(clustered_dupes):
    # The first member of each cluster serves as the cluster's canonical id
    for cluster, scores in clustered_dupes:
        cluster_id = cluster[0]
        for donor_id, score in zip(cluster, scores):
            yield donor_id, cluster_id, score
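
`cluster_ids` flattens dedupe's clustered output into exactly the (id, canon_id, cluster_score) rows that entity_map stores. Nothing in the file chains the pipeline together; a plausible driver, with hypothetical directory names, would be:

# Hypothetical end-to-end run; input/output paths are made up.
if __name__ == "__main__":
    ddr = DbDedupeRecords("data/input_marc", "data/output")
    model = ddr.deduper()  # reuse saved settings, or train interactively
    ddr.block(model)       # rebuild the blocking_map table
    ddr.cluster(model)     # score candidate pairs, write entity_map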