Create main runner app, with async multithreading

2024-01-17 16:06:15 -06:00
parent 33671683ea
commit 01526524d4
8 changed files with 263800 additions and 55 deletions
--- a/read_datasheet.py
+++ b/read_datasheet.py
@@ -8,13 +8,14 @@ import numpy as np
 from PIL import Image
 import io
 import json
+from util import fprint

 def parse(filename, output_dir):

    # Extract table data

    tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
-    #print("Total tables extracted:", tables.n)
+    #fprint("Total tables extracted:", tables.n)
    n = 0
    pagenum = 0
    reader = PdfReader(filename)
@@ -27,10 +28,10 @@ def parse(filename, output_dir):
        table.df.replace(np.nan, '', inplace=True)
        
        if not table.df.empty:
-            #print("\nTable " + str(n))
+            #fprint("\nTable " + str(n))
            # Extract table names
            table_start = table.cells[0][0].lt[1] # Read top-left cell's top-left coordinate
-            #print(table_start)
+            #fprint(table_start)
            ymin = table_start
            ymax = table_start + 10
            if pagenum != table.page - 1:
@@ -46,20 +47,20 @@ def parse(filename, output_dir):
            text_body = "".join(parts).strip('\n')
            if len(text_body) == 0:
                text_body = str(n)
-            #print(text_body)
+            #fprint(text_body)
            

            table_list[text_body] = table.df
            #table.to_html("table" + str(n) + ".html")
            
-            #print(table.df)
+            #fprint(table.df)
            #camelot.plot(table, kind='grid').savefig("test" + str(n) + ".png")
            n=n+1
    #camelot.plot(tables[0], kind='grid').savefig("test.png")
    
    #tables.export(output_dir + '/techdata.json', f='json')

-    # print(table_list)
+    # fprint(table_list)
    # Extract Basic details - part name & description, image, etc

    reader = PdfReader(filename)
@@ -68,7 +69,7 @@ def parse(filename, output_dir):
    skip = False
    for image_file_object in page.images:
        if image_file_object.name == "img0.png" and skip == False:
-            #print(Image.open(io.BytesIO(image_file_object.data)).mode)
+            #fprint(Image.open(io.BytesIO(image_file_object.data)).mode)
            if Image.open(io.BytesIO(image_file_object.data)).mode == "P":
                skip = True
                continue
@@ -137,20 +138,20 @@ def parse(filename, output_dir):

        # multi-page table check
        if table_name.isdigit() and len(tables) > 1:
-            print(table_name)
-            print(previous_table)
+            fprint(table_name)
+            fprint(previous_table)
            
            
            
            
            main_key = previous_table
            cont_key = table_name
-            print(tables)
+            fprint(tables)
            if vertical == False:
                main_keys = list(tables[main_key].keys())
                for i, (cont_key, cont_values) in enumerate(tables[cont_key].items()):
                    if i < len(main_keys):
-                        print(tables[main_key][main_keys[i]])
+                        fprint(tables[main_key][main_keys[i]])
                        tables[main_key][main_keys[i]] = (tables[main_key][main_keys[i]] + (cont_key,) + cont_values)

                del tables[table_name]
@@ -163,7 +164,7 @@ def parse(filename, output_dir):
        previous_table = table_name
    

-    print(tables)
+    fprint(tables)
    with open(output_dir + "/tables.json", 'w') as json_file:
        json.dump(tables, json_file)