From 8cf004fd511aeecb8e8bc96e5db4c4da9a15988f Mon Sep 17 00:00:00 2001 From: Charles Paperman <cpaperma@links-networkdisk-backend.lille.inria.fr> Date: Sat, 30 Sep 2023 16:13:21 +0200 Subject: [PATCH] fix ingestion --- bibendum/ingestions/bulk_process.py | 6 +++--- bibendum/ingestions/queries/copy.py | 3 ++- bibendum/ingestions/queries/insert.py | 6 +++--- bibendum/ingestions/queries/others.py | 2 +- bibendum/ingestions/queries/temporary_tables.py | 4 ++-- bibendum/ingestions/queries/update.py | 8 ++++---- bibendum/sql.py | 6 +++--- update_ingested_script.py | 6 +++--- 8 files changed, 21 insertions(+), 20 deletions(-) diff --git a/bibendum/ingestions/bulk_process.py b/bibendum/ingestions/bulk_process.py index 98c73e7..934ea51 100644 --- a/bibendum/ingestions/bulk_process.py +++ b/bibendum/ingestions/bulk_process.py @@ -75,7 +75,7 @@ def bulk_insert_containers(cursor, notices, notice_cls): cursor.execute(insert.from_temp(container_cls, *fields)) # Update non-existing containers d = dict(cursor.execute(others.uuid_raw(container_cls))) - cursor.execute(f"DROP TABLE temp_{cont_name}") + cursor.execute(f"DROP TABLE temp_{cont_name.value}") # update duplicates to point toward the appropriate uuid return d @@ -84,7 +84,7 @@ def bulk_insert_notices(cursor, notices, containers_map, notice_cls): fields.difference_update(("authors", "editors", "identifiers", "id", "advisors", "jury", "institutions")) cont_id = False if notice_cls.container_cls: - cont_id = notice_cls.container_cls.get_document_type+"_id" + cont_id = notice_cls.container_cls.get_document_type.value+"_id" fields.remove("container") fields.add(cont_id) def genargs(raw_uid, notice): @@ -132,7 +132,7 @@ def bulk_insert_notices(cursor, notices, containers_map, notice_cls): # update identifiers for olds notices d = dict(cursor.execute(others.uuid_raw(notice_cls))) - cursor.execute(f"DROP TABLE temp_{notice_cls.get_document_type}") + cursor.execute(f"DROP TABLE temp_{notice_cls.get_document_type.value}") return d diff --git a/bibendum/ingestions/queries/copy.py b/bibendum/ingestions/queries/copy.py index 8becdcf..f22ab79 100644 --- a/bibendum/ingestions/queries/copy.py +++ b/bibendum/ingestions/queries/copy.py @@ -4,10 +4,11 @@ from bibendum.ingestions.utils import get_tables_name raw_notice = "COPY temp_raw_notice (notice, source, source_identifier) FROM STDIN" person = "COPY temp_person (first_name, middle, last_name, targets) FROM STDIN" def container_template(cont_name, *fields): + cont_name = cont_name.value fields = ", ".join(fields) + "," return f"COPY temp_{cont_name} ({fields} raws_id, new) FROM STDIN" def notice_template(cls, *fields): - name = cls.get_document_type + name = cls.get_document_type.value fields = ", ".join(fields) + "," return f"COPY temp_{name} ({fields} raws_id, identifiers, new) FROM STDIN" diff --git a/bibendum/ingestions/queries/insert.py b/bibendum/ingestions/queries/insert.py index 8884f2e..f3b7104 100644 --- a/bibendum/ingestions/queries/insert.py +++ b/bibendum/ingestions/queries/insert.py @@ -8,7 +8,7 @@ INSERT INTO raw_notice(notice, source, source_identifier, processed) """ def from_temp(cls, *fields): - name = cls.get_document_type + name = cls.get_document_type.value fields = ",".join(fields) return f""" INSERT INTO {name} (id, {fields}) @@ -19,7 +19,7 @@ INSERT INTO {name} (id, {fields}) """ def notice_identifier_new(cls): - name = cls.get_document_type + name = cls.get_document_type.value return f""" INSERT INTO notice_identifier(notice_id, identifiers) SELECT id, identifiers FROM @@ -29,7 +29,7 @@ INSERT INTO notice_identifier(notice_id, identifiers) """ def notice_raws_new(cls): - name = cls.get_document_type + name = cls.get_document_type.value return f""" INSERT INTO notice_to_raw (notice_id, raws_id) SELECT id, raws_id FROM diff --git a/bibendum/ingestions/queries/others.py b/bibendum/ingestions/queries/others.py index 6db7791..cee625b 100644 --- a/bibendum/ingestions/queries/others.py +++ b/bibendum/ingestions/queries/others.py @@ -3,6 +3,6 @@ ingested_raw_unsplit = "SELECT id, notice FROM raw_notice WHERE processed='Inges def uuid_raw(cls): - name = cls.get_document_type + name = cls.get_document_type.value return f""" SELECT raw_id, id FROM temp_{name}, UNNEST(raws_id) as raw_id""" diff --git a/bibendum/ingestions/queries/temporary_tables.py b/bibendum/ingestions/queries/temporary_tables.py index 3991a44..d0417dd 100644 --- a/bibendum/ingestions/queries/temporary_tables.py +++ b/bibendum/ingestions/queries/temporary_tables.py @@ -6,7 +6,7 @@ raw_notice = "CREATE TEMPORARY TABLE temp_raw_notice AS SELECT * FROM raw_notice doctype_to_class = notice.Notice.doctype_to_class def create_cont_temp(cls, *fields): - name = cls.get_document_type + name = cls.get_document_type.value fields = "\n " + ",\n ".join(fields)+"," return f""" CREATE TEMPORARY TABLE temp_{name} AS @@ -19,7 +19,7 @@ CREATE TEMPORARY TABLE temp_{name} AS """ def create_not_temp(cls, *fields): - name = cls.get_document_type + name = cls.get_document_type.value fields = "\n " + ",\n ".join(fields)+"," return f""" CREATE TEMPORARY TABLE temp_{name} AS diff --git a/bibendum/ingestions/queries/update.py b/bibendum/ingestions/queries/update.py index 072f3a2..2ae6fb8 100644 --- a/bibendum/ingestions/queries/update.py +++ b/bibendum/ingestions/queries/update.py @@ -3,7 +3,7 @@ flag_notices_success = "UPDATE raw_notice SET processed='Processed'::RAW_STATUS def from_temporary(cls): - name = cls.get_document_type + name = cls.get_document_type.value ifields = ", ".join(f"i.{e}" for e in cls.unique_keys) tfields = ", ".join(f"t.{e}" for e in cls.unique_keys) return f""" @@ -16,7 +16,7 @@ UPDATE temp_{name} as t """ def temporary_uuid(cls, cont_id=False): - name = cls.get_document_type + name = cls.get_document_type.value fields = list(cls.unique_keys) if cont_id: idx = fields.index("container") @@ -51,7 +51,7 @@ UPDATE temp_person """ def notice_identifier_old(cls): - name = cls.get_document_type + name = cls.get_document_type.value return f""" UPDATE notice_identifier as i SET @@ -71,7 +71,7 @@ UPDATE notice_identifier as i """ def notice_raw_old(cls): - name = cls.get_document_type + name = cls.get_document_type.value return f""" UPDATE notice_to_raw as i SET diff --git a/bibendum/sql.py b/bibendum/sql.py index 5bb007a..5559c7c 100644 --- a/bibendum/sql.py +++ b/bibendum/sql.py @@ -37,7 +37,7 @@ SELECT base.*, raws_id FROM tbl_name = document_type.value join = f""" {tbl_name} INNER JOIN notice_identifier ON {tbl_name}.id = notice_identifier.notice_id - INNER JOIN notice_to_raw ON {tbl_name}.id = notice_to_raw.notice_id + INNER JOIN notice_to_raw ON {tbl_name}.id = notice_to_raw.notice_id INNER JOIN raw_notice """ cont_data = "" if notice.container_cls: @@ -45,10 +45,10 @@ SELECT base.*, raws_id FROM join = f"{join} LEFT JOIN {cnt_name} ON {tbl_name}.{cnt_name}_id = {cnt_name}.id" cont_data = f", row_to_json({cnt_name}.*) as container" return f""" -SELECT {tbl_name}.*,p.authors{cont_data}, to_json(notice_identifier.identifiers) as identifiers, notice_to_raw.raws_id +SELECT {tbl_name}.*,p.authors{cont_data}, to_json(notice_identifier.identifiers) as identifiers, ROW(raw_notice.*) FROM {join} LEFT JOIN ( - SELECT target_id, json_agg(ROW(T.id, T.first_name, T.middle, T.last_name)::person) as authors + SELECT target_id, json_agg(ROW(T.id, T.first_name, T.middle, T.last_name)::person) as authors, FROM (SELECT target_id, person.* FROM diff --git a/update_ingested_script.py b/update_ingested_script.py index 8d15f6b..a68b8a7 100644 --- a/update_ingested_script.py +++ b/update_ingested_script.py @@ -2,9 +2,9 @@ import sys from bibendum.models import NoticeSource import bibendum.ingestions as ing from multiprocessing import Process -size = 50000 -procnb_crossref = 5 -procnb_hal = 2 +size = 200000 +procnb_crossref = 1 +procnb_hal = 1 def ingest(source): ing.bulk_process.process_raw_notices_unsplit(dict(dbname="bibendum"), source, size=size) -- GitLab