From 8cf004fd511aeecb8e8bc96e5db4c4da9a15988f Mon Sep 17 00:00:00 2001
From: Charles Paperman <cpaperma@links-networkdisk-backend.lille.inria.fr>
Date: Sat, 30 Sep 2023 16:13:21 +0200
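Subject: [PATCH] fix ingestion: use get_document_type.value in SQL names

Build the table and column names used by the ingestion queries from
get_document_type.value instead of get_document_type itself.
Also join raw_notice in the notice query of bibendum/sql.py, and set
size to 200000 and procnb_crossref / procnb_hal to 1 in
update_ingested_script.py.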

---
 bibendum/ingestions/bulk_process.py             | 6 +++---
 bibendum/ingestions/queries/copy.py             | 3 ++-
 bibendum/ingestions/queries/insert.py           | 6 +++---
 bibendum/ingestions/queries/others.py           | 2 +-
 bibendum/ingestions/queries/temporary_tables.py | 4 ++--
 bibendum/ingestions/queries/update.py           | 8 ++++----
 bibendum/sql.py                                 | 6 +++---
 update_ingested_script.py                       | 6 +++---
 8 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/bibendum/ingestions/bulk_process.py b/bibendum/ingestions/bulk_process.py
index 98c73e7..934ea51 100644
--- a/bibendum/ingestions/bulk_process.py
+++ b/bibendum/ingestions/bulk_process.py
@@ -75,7 +75,7 @@ def bulk_insert_containers(cursor, notices, notice_cls):
 
     cursor.execute(insert.from_temp(container_cls, *fields)) # Update non-existing containers
     d = dict(cursor.execute(others.uuid_raw(container_cls)))
-    cursor.execute(f"DROP TABLE temp_{cont_name}")
+    cursor.execute(f"DROP TABLE temp_{cont_name.value}")
     # update duplicates to point toward the appropriate uuid
     return d
 
@@ -84,7 +84,7 @@ def bulk_insert_notices(cursor, notices, containers_map, notice_cls):
     fields.difference_update(("authors", "editors", "identifiers", "id", "advisors", "jury", "institutions"))
     cont_id = False
     if notice_cls.container_cls:
-        cont_id = notice_cls.container_cls.get_document_type+"_id"
+        cont_id = notice_cls.container_cls.get_document_type.value+"_id"
         fields.remove("container")
         fields.add(cont_id)
         def genargs(raw_uid, notice):
@@ -132,7 +132,7 @@ def bulk_insert_notices(cursor, notices, containers_map, notice_cls):
     # update identifiers for olds notices
 
     d = dict(cursor.execute(others.uuid_raw(notice_cls))) 
-    cursor.execute(f"DROP TABLE temp_{notice_cls.get_document_type}")
+    cursor.execute(f"DROP TABLE temp_{notice_cls.get_document_type.value}")
     return d
     
 
diff --git a/bibendum/ingestions/queries/copy.py b/bibendum/ingestions/queries/copy.py
index 8becdcf..f22ab79 100644
--- a/bibendum/ingestions/queries/copy.py
+++ b/bibendum/ingestions/queries/copy.py
@@ -4,10 +4,11 @@ from bibendum.ingestions.utils import get_tables_name
 raw_notice = "COPY temp_raw_notice (notice, source, source_identifier) FROM STDIN"
 person = "COPY temp_person (first_name, middle, last_name, targets) FROM STDIN"
 def container_template(cont_name, *fields):
+    cont_name = cont_name.value
     fields = ", ".join(fields) + ","
     return f"COPY temp_{cont_name} ({fields} raws_id, new) FROM STDIN"
 
 def notice_template(cls, *fields):
-    name = cls.get_document_type
+    name = cls.get_document_type.value
     fields = ", ".join(fields) + ","
     return f"COPY temp_{name} ({fields} raws_id, identifiers, new) FROM STDIN"
diff --git a/bibendum/ingestions/queries/insert.py b/bibendum/ingestions/queries/insert.py
index 8884f2e..f3b7104 100644
--- a/bibendum/ingestions/queries/insert.py
+++ b/bibendum/ingestions/queries/insert.py
@@ -8,7 +8,7 @@ INSERT INTO raw_notice(notice, source, source_identifier, processed)
 """
 
 def from_temp(cls, *fields):
-    name = cls.get_document_type
+    name = cls.get_document_type.value
     fields = ",".join(fields)
     return f"""
 INSERT INTO {name} (id, {fields}) 
@@ -19,7 +19,7 @@ INSERT INTO {name} (id, {fields})
 """
 
 def notice_identifier_new(cls):
-    name = cls.get_document_type
+    name = cls.get_document_type.value
     return f"""
 INSERT INTO notice_identifier(notice_id, identifiers)
     SELECT id, identifiers FROM
@@ -29,7 +29,7 @@ INSERT INTO notice_identifier(notice_id, identifiers)
 """
 
 def notice_raws_new(cls):
-    name = cls.get_document_type
+    name = cls.get_document_type.value
     return f"""
 INSERT INTO notice_to_raw (notice_id, raws_id)
     SELECT id, raws_id FROM
diff --git a/bibendum/ingestions/queries/others.py b/bibendum/ingestions/queries/others.py
index 6db7791..cee625b 100644
--- a/bibendum/ingestions/queries/others.py
+++ b/bibendum/ingestions/queries/others.py
@@ -3,6 +3,6 @@ ingested_raw_unsplit = "SELECT id, notice FROM raw_notice WHERE processed='Inges
 
 
 def uuid_raw(cls):
-    name = cls.get_document_type
+    name = cls.get_document_type.value
     return f"""
 SELECT raw_id, id FROM temp_{name}, UNNEST(raws_id) as raw_id"""
diff --git a/bibendum/ingestions/queries/temporary_tables.py b/bibendum/ingestions/queries/temporary_tables.py
index 3991a44..d0417dd 100644
--- a/bibendum/ingestions/queries/temporary_tables.py
+++ b/bibendum/ingestions/queries/temporary_tables.py
@@ -6,7 +6,7 @@ raw_notice = "CREATE TEMPORARY TABLE temp_raw_notice AS SELECT * FROM raw_notice
 doctype_to_class = notice.Notice.doctype_to_class
 
 def create_cont_temp(cls, *fields):
-    name = cls.get_document_type
+    name = cls.get_document_type.value
     fields = "\n        " + ",\n        ".join(fields)+","
     return f"""
 CREATE TEMPORARY TABLE temp_{name} AS 
@@ -19,7 +19,7 @@ CREATE TEMPORARY TABLE temp_{name} AS
 """
 
 def create_not_temp(cls, *fields):
-    name = cls.get_document_type
+    name = cls.get_document_type.value
     fields = "\n      " + ",\n      ".join(fields)+","
     return f"""
 CREATE TEMPORARY TABLE temp_{name} AS 
diff --git a/bibendum/ingestions/queries/update.py b/bibendum/ingestions/queries/update.py
index 072f3a2..2ae6fb8 100644
--- a/bibendum/ingestions/queries/update.py
+++ b/bibendum/ingestions/queries/update.py
@@ -3,7 +3,7 @@ flag_notices_success = "UPDATE raw_notice SET processed='Processed'::RAW_STATUS
 
 
 def from_temporary(cls):
-    name = cls.get_document_type
+    name = cls.get_document_type.value
     ifields = ", ".join(f"i.{e}" for e in cls.unique_keys)
     tfields = ", ".join(f"t.{e}" for e in cls.unique_keys)
     return f"""
@@ -16,7 +16,7 @@ UPDATE temp_{name} as t
 """
 
 def temporary_uuid(cls, cont_id=False):
-    name = cls.get_document_type
+    name = cls.get_document_type.value
     fields = list(cls.unique_keys)
     if cont_id:
         idx = fields.index("container") 
@@ -51,7 +51,7 @@ UPDATE temp_person
 """
 
 def notice_identifier_old(cls):
-    name = cls.get_document_type
+    name = cls.get_document_type.value
     return f"""
 UPDATE notice_identifier as i
     SET  
@@ -71,7 +71,7 @@ UPDATE notice_identifier as i
 """
 
 def notice_raw_old(cls):
-    name = cls.get_document_type
+    name = cls.get_document_type.value
     return f"""
 UPDATE notice_to_raw as i
     SET  
diff --git a/bibendum/sql.py b/bibendum/sql.py
index 5bb007a..5559c7c 100644
--- a/bibendum/sql.py
+++ b/bibendum/sql.py
@@ -37,7 +37,7 @@ SELECT base.*, raws_id FROM
         tbl_name = document_type.value
         join = f"""
 {tbl_name} INNER JOIN notice_identifier ON {tbl_name}.id = notice_identifier.notice_id
-    INNER JOIN notice_to_raw ON {tbl_name}.id = notice_to_raw.notice_id
+    INNER JOIN notice_to_raw ON {tbl_name}.id = notice_to_raw.notice_id INNER JOIN raw_notice 
 """
         cont_data = ""
         if notice.container_cls:
@@ -45,10 +45,10 @@ SELECT base.*, raws_id FROM
             join = f"{join} LEFT JOIN {cnt_name} ON {tbl_name}.{cnt_name}_id = {cnt_name}.id"
             cont_data = f", row_to_json({cnt_name}.*) as container"
         return f"""
-SELECT {tbl_name}.*,p.authors{cont_data}, to_json(notice_identifier.identifiers) as identifiers, notice_to_raw.raws_id
+SELECT {tbl_name}.*,p.authors{cont_data}, to_json(notice_identifier.identifiers) as identifiers, ROW(raw_notice.*)
 FROM {join} LEFT JOIN
     (
-        SELECT target_id, json_agg(ROW(T.id, T.first_name, T.middle, T.last_name)::person) as authors
+        SELECT target_id, json_agg(ROW(T.id, T.first_name, T.middle, T.last_name)::person) as authors
         FROM
             (SELECT target_id, person.*
             FROM 
diff --git a/update_ingested_script.py b/update_ingested_script.py
index 8d15f6b..a68b8a7 100644
--- a/update_ingested_script.py
+++ b/update_ingested_script.py
@@ -2,9 +2,9 @@ import sys
 from bibendum.models import NoticeSource
 import bibendum.ingestions as ing
 from multiprocessing import Process
-size = 50000
-procnb_crossref = 5 
-procnb_hal = 2
+size = 200000
+procnb_crossref = 1 
+procnb_hal = 1
 
 def ingest(source):
     ing.bulk_process.process_raw_notices_unsplit(dict(dbname="bibendum"), source, size=size)
-- 
GitLab