Merge branch 'main' into dthomas_meilisearch

# Conflicts: # .gitignore # read_datasheet.py
2024-03-12 16:13:41 -05:00
parent a63faba2aa dd0ac46662
commit 5ef8795eb4
18 changed files with 1559 additions and 120 deletions
--- a/read_datasheet.py
+++ b/read_datasheet.py
@@ -2,6 +2,8 @@

 # Parse Belden catalog techdata datasheets 

+import pandas as pd
+pd.set_option('future.no_silent_downcasting', True)
 from PyPDF2 import PdfReader
 import camelot
 import numpy as np
@@ -11,6 +13,11 @@ import json
 from util import fprint
 import uuid
 from util import run_cmd
+import os
+
+def touch(path):  # create `path` if it does not exist, then stamp it with the current time
+    with open(path, 'a'):  # append mode creates a missing file without truncating an existing one
+        os.utime(path, None)  # times=None sets the access and modification times to now

 def parse(filename, output_dir, partnum, dstype):

@@ -23,6 +30,7 @@ def parse(filename, output_dir, partnum, dstype):
    reader = PdfReader(filename)
    page = reader.pages[0]
    table_list = {}
+
    for table in tables:
        table.df.infer_objects(copy=False)
        table.df.replace('', np.nan, inplace=True)
@@ -90,6 +98,7 @@ def parse(filename, output_dir, partnum, dstype):

    # Table parsing and reordring
    tables = dict()
+    torename = dict()
    previous_table = ""
    for table_name in table_list.keys():
        # determine shape: horizontal or vertical
@@ -121,7 +130,8 @@ def parse(filename, output_dir, partnum, dstype):
        for table_name_2 in table_list.keys(): 
            if table_name_2.find(table.iloc[-1, 0]) >= 0:
                # Name taken from table directly above - this table does not have a name
-                table_list["Specs " + str(len(tables))] = table_list.pop(table_name_2, None) # rename table to arbitrary altername name
+                torename[table_name_2] = "Specs " + str(len(tables))
+                #table_list["Specs " + str(len(tables))] = table_list[table_name_2] # rename table to arbitrary alternate name
                break

        if vertical:
@@ -142,21 +152,21 @@ def parse(filename, output_dir, partnum, dstype):
        # multi-page table check
        if dstype == "Belden":
            if table_name.isdigit() and len(tables) > 1:
-                fprint(table_name)
-                fprint(previous_table)
+                #fprint(table_name)
+                #fprint(previous_table)
                
                
                
                
                main_key = previous_table
                cont_key = table_name
-                fprint(tables)
+                #fprint(tables)
                if vertical == False:
                    main_keys = list(tables[main_key].keys())
                    for i, (cont_key, cont_values) in enumerate(tables[cont_key].items()):
                        if i < len(main_keys):
-                            fprint(tables[main_key][main_keys[i]])
-                            tables[main_key][main_keys[i]] = (tables[main_key][main_keys[i]] + (cont_key,) + cont_values)
+                            #fprint(tables[main_key][main_keys[i]])
+                            tables[main_key][main_keys[i]] = (tuple(tables[main_key][main_keys[i]]) + (cont_key,) + cont_values)
    
                    del tables[table_name]
    
@@ -167,6 +177,10 @@ def parse(filename, output_dir, partnum, dstype):

        previous_table = table_name
    
+    # remove renamed tables
+    for table_name in torename.keys():
+        tables[torename[table_name]] = tables[table_name]
+        del tables[table_name]
    # remove multi-line values that occasionally squeak through
    def replace_newlines_in_dict(d):
        for key, value in d.items():
@@ -195,12 +209,12 @@ def parse(filename, output_dir, partnum, dstype):
    


-    print(output_table)
+    #print(output_table)

-    run_cmd("rm " + output_dir + "/*.json") # not reliable!
+    run_cmd("rm \"" + output_dir + "\"/*.json") # not reliable!
    with open(output_dir + "/" + output_table["searchspecs"]["id"] + ".json", 'w') as json_file:
        json.dump(output_table["searchspecs"], json_file)
-
+    touch(output_dir + "/parsed")
    return output_table


@@ -217,7 +231,7 @@ def flatten(tables):
                # If it fails again, return the original string.
                return s
    out = dict()
-    print("{")
+    #print("{")
    for table in tables.keys():
        for key in tables[table].keys():
            if len(key) < 64:
@@ -228,11 +242,16 @@ def flatten(tables):
            fullkeyname = (table + ": " + keyname).replace(".","")
            if type(tables[table][key]) is not tuple:
                out[fullkeyname] = convert_to_number(tables[table][key])
-                print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
+                #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
            elif len(tables[table][key]) == 1:
                out[fullkeyname] = convert_to_number(tables[table][key][0])
                
-                print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
+                #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
+
+            # if the item has at least one comma in it, split it
+            if tables[table][key].count(',') > 0:
+                out[fullkeyname] = list(map(lambda x: x.strip(), tables[table][key].split(",")))
+                #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")

            # if the item has at least two commas in it, split it
            if tables[table][key].count(',') > 0:
@@ -240,7 +259,7 @@ def flatten(tables):
                print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")


-    print("}")
+    #print("}")
    return out