Merge branch 'main' into dthomas_meilisearch

# Conflicts:
#	.gitignore
#	read_datasheet.py
This commit is contained in:
2024-03-12 16:13:41 -05:00
18 changed files with 1559 additions and 120 deletions

View File

@@ -2,6 +2,8 @@
# Parse Belden catalog techdata datasheets
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)
from PyPDF2 import PdfReader
import camelot
import numpy as np
@@ -11,6 +13,11 @@ import json
from util import fprint
import uuid
from util import run_cmd
import os
def touch(path):
    """Create ``path`` if it is missing and refresh its timestamps.

    The file is opened in append mode, so any existing content is left
    untouched. ``os.utime`` with ``None`` then sets the access and
    modification times to the current time.
    """
    handle = open(path, 'a')
    try:
        os.utime(path, None)
    finally:
        # Always release the handle, even if updating the timestamps fails.
        handle.close()
def parse(filename, output_dir, partnum, dstype):
@@ -23,6 +30,7 @@ def parse(filename, output_dir, partnum, dstype):
reader = PdfReader(filename)
page = reader.pages[0]
table_list = {}
for table in tables:
table.df.infer_objects(copy=False)
table.df.replace('', np.nan, inplace=True)
@@ -90,6 +98,7 @@ def parse(filename, output_dir, partnum, dstype):
# Table parsing and reordering
tables = dict()
torename = dict()
previous_table = ""
for table_name in table_list.keys():
# determine shape: horizontal or vertical
@@ -121,7 +130,8 @@ def parse(filename, output_dir, partnum, dstype):
for table_name_2 in table_list.keys():
if table_name_2.find(table.iloc[-1, 0]) >= 0:
# Name taken from table directly above - this table does not have a name
table_list["Specs " + str(len(tables))] = table_list.pop(table_name_2, None) # rename table to arbitrary altername name
torename[table_name_2] = "Specs " + str(len(tables))
#table_list["Specs " + str(len(tables))] = table_list[table_name_2] # rename table to arbitrary alternate name
break
if vertical:
@@ -142,21 +152,21 @@ def parse(filename, output_dir, partnum, dstype):
# multi-page table check
if dstype == "Belden":
if table_name.isdigit() and len(tables) > 1:
fprint(table_name)
fprint(previous_table)
#fprint(table_name)
#fprint(previous_table)
main_key = previous_table
cont_key = table_name
fprint(tables)
#fprint(tables)
if vertical == False:
main_keys = list(tables[main_key].keys())
for i, (cont_key, cont_values) in enumerate(tables[cont_key].items()):
if i < len(main_keys):
fprint(tables[main_key][main_keys[i]])
tables[main_key][main_keys[i]] = (tables[main_key][main_keys[i]] + (cont_key,) + cont_values)
#fprint(tables[main_key][main_keys[i]])
tables[main_key][main_keys[i]] = (tuple(tables[main_key][main_keys[i]]) + (cont_key,) + cont_values)
del tables[table_name]
@@ -167,6 +177,10 @@ def parse(filename, output_dir, partnum, dstype):
previous_table = table_name
# remove renamed tables
for table_name in torename.keys():
tables[torename[table_name]] = tables[table_name]
del tables[table_name]
# remove multi-line values that occasionally squeak through
def replace_newlines_in_dict(d):
for key, value in d.items():
@@ -195,12 +209,12 @@ def parse(filename, output_dir, partnum, dstype):
print(output_table)
#print(output_table)
run_cmd("rm " + output_dir + "/*.json") # not reliable!
run_cmd("rm \"" + output_dir + "\"/*.json") # not reliable!
with open(output_dir + "/" + output_table["searchspecs"]["id"] + ".json", 'w') as json_file:
json.dump(output_table["searchspecs"], json_file)
touch(output_dir + "/parsed")
return output_table
@@ -217,7 +231,7 @@ def flatten(tables):
# If it fails again, return the original string.
return s
out = dict()
print("{")
#print("{")
for table in tables.keys():
for key in tables[table].keys():
if len(key) < 64:
@@ -228,11 +242,16 @@ def flatten(tables):
fullkeyname = (table + ": " + keyname).replace(".","")
if type(tables[table][key]) is not tuple:
out[fullkeyname] = convert_to_number(tables[table][key])
print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
#print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
elif len(tables[table][key]) == 1:
out[fullkeyname] = convert_to_number(tables[table][key][0])
print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
#print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
# if the item has at least two commas in it, split it
if tables[table][key].count(',') > 0:
out[fullkeyname] = list(map(lambda x: x.strip(), tables[table][key].split(",")))
#print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
# if the item has at least two commas in it, split it
if tables[table][key].count(',') > 0:
@@ -240,7 +259,7 @@ def flatten(tables):
print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
print("}")
#print("}")
return out