Merge branch 'main' into dthomas_meilisearch
# Conflicts: # .gitignore # read_datasheet.py
This commit is contained in:
@@ -2,6 +2,8 @@
|
||||
|
||||
# Parse Belden catalog techdata datasheets
|
||||
|
||||
import pandas as pd
|
||||
pd.set_option('future.no_silent_downcasting', True)
|
||||
from PyPDF2 import PdfReader
|
||||
import camelot
|
||||
import numpy as np
|
||||
@@ -11,6 +13,11 @@ import json
|
||||
from util import fprint
|
||||
import uuid
|
||||
from util import run_cmd
|
||||
import os
|
||||
|
||||
def touch(path):
    """Create ``path`` if it is missing and set its timestamps to now.

    Opening in append mode creates the file when it does not exist and
    leaves existing contents untouched; ``os.utime`` with ``None`` then
    sets the access and modification times to the current time.
    """
    with open(path, mode='a') as _handle:
        os.utime(path, None)
|
||||
|
||||
def parse(filename, output_dir, partnum, dstype):
|
||||
|
||||
@@ -23,6 +30,7 @@ def parse(filename, output_dir, partnum, dstype):
|
||||
reader = PdfReader(filename)
|
||||
page = reader.pages[0]
|
||||
table_list = {}
|
||||
|
||||
for table in tables:
|
||||
table.df.infer_objects(copy=False)
|
||||
table.df.replace('', np.nan, inplace=True)
|
||||
@@ -90,6 +98,7 @@ def parse(filename, output_dir, partnum, dstype):
|
||||
|
||||
# Table parsing and reordering
|
||||
tables = dict()
|
||||
torename = dict()
|
||||
previous_table = ""
|
||||
for table_name in table_list.keys():
|
||||
# determine shape: horizontal or vertical
|
||||
@@ -121,7 +130,8 @@ def parse(filename, output_dir, partnum, dstype):
|
||||
for table_name_2 in table_list.keys():
|
||||
if table_name_2.find(table.iloc[-1, 0]) >= 0:
|
||||
# Name taken from table directly above - this table does not have a name
|
||||
table_list["Specs " + str(len(tables))] = table_list.pop(table_name_2, None) # rename table to arbitrary altername name
|
||||
torename[table_name_2] = "Specs " + str(len(tables))
|
||||
#table_list["Specs " + str(len(tables))] = table_list[table_name_2] # rename table to arbitrary alternate name
|
||||
break
|
||||
|
||||
if vertical:
|
||||
@@ -142,21 +152,21 @@ def parse(filename, output_dir, partnum, dstype):
|
||||
# multi-page table check
|
||||
if dstype == "Belden":
|
||||
if table_name.isdigit() and len(tables) > 1:
|
||||
fprint(table_name)
|
||||
fprint(previous_table)
|
||||
#fprint(table_name)
|
||||
#fprint(previous_table)
|
||||
|
||||
|
||||
|
||||
|
||||
main_key = previous_table
|
||||
cont_key = table_name
|
||||
fprint(tables)
|
||||
#fprint(tables)
|
||||
if vertical == False:
|
||||
main_keys = list(tables[main_key].keys())
|
||||
for i, (cont_key, cont_values) in enumerate(tables[cont_key].items()):
|
||||
if i < len(main_keys):
|
||||
fprint(tables[main_key][main_keys[i]])
|
||||
tables[main_key][main_keys[i]] = (tables[main_key][main_keys[i]] + (cont_key,) + cont_values)
|
||||
#fprint(tables[main_key][main_keys[i]])
|
||||
tables[main_key][main_keys[i]] = (tuple(tables[main_key][main_keys[i]]) + (cont_key,) + cont_values)
|
||||
|
||||
del tables[table_name]
|
||||
|
||||
@@ -167,6 +177,10 @@ def parse(filename, output_dir, partnum, dstype):
|
||||
|
||||
previous_table = table_name
|
||||
|
||||
# remove renamed tables
|
||||
for table_name in torename.keys():
|
||||
tables[torename[table_name]] = tables[table_name]
|
||||
del tables[table_name]
|
||||
# remove multi-line values that occasionally squeak through
|
||||
def replace_newlines_in_dict(d):
|
||||
for key, value in d.items():
|
||||
@@ -195,12 +209,12 @@ def parse(filename, output_dir, partnum, dstype):
|
||||
|
||||
|
||||
|
||||
print(output_table)
|
||||
#print(output_table)
|
||||
|
||||
run_cmd("rm " + output_dir + "/*.json") # not reliable!
|
||||
run_cmd("rm \"" + output_dir + "\"/*.json") # not reliable!
|
||||
with open(output_dir + "/" + output_table["searchspecs"]["id"] + ".json", 'w') as json_file:
|
||||
json.dump(output_table["searchspecs"], json_file)
|
||||
|
||||
touch(output_dir + "/parsed")
|
||||
return output_table
|
||||
|
||||
|
||||
@@ -217,7 +231,7 @@ def flatten(tables):
|
||||
# If it fails again, return the original string.
|
||||
return s
|
||||
out = dict()
|
||||
print("{")
|
||||
#print("{")
|
||||
for table in tables.keys():
|
||||
for key in tables[table].keys():
|
||||
if len(key) < 64:
|
||||
@@ -228,11 +242,16 @@ def flatten(tables):
|
||||
fullkeyname = (table + ": " + keyname).replace(".","")
|
||||
if type(tables[table][key]) is not tuple:
|
||||
out[fullkeyname] = convert_to_number(tables[table][key])
|
||||
print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
|
||||
#print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
|
||||
elif len(tables[table][key]) == 1:
|
||||
out[fullkeyname] = convert_to_number(tables[table][key][0])
|
||||
|
||||
print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
|
||||
#print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
|
||||
|
||||
# if the item has at least one comma in it, split it
|
||||
if tables[table][key].count(',') > 0:
|
||||
out[fullkeyname] = list(map(lambda x: x.strip(), tables[table][key].split(",")))
|
||||
#print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
|
||||
|
||||
# if the item has at least one comma in it, split it
|
||||
if tables[table][key].count(',') > 0:
|
||||
@@ -240,7 +259,7 @@ def flatten(tables):
|
||||
print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
|
||||
|
||||
|
||||
print("}")
|
||||
#print("}")
|
||||
return out
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user