Update Alphawire table parsing

This commit is contained in:
Cole Deck 2024-03-14 21:35:26 -05:00
parent 25ceb6c133
commit 39723ec442
2 changed files with 120 additions and 42 deletions

View File

@ -112,12 +112,14 @@ def query_search(partnum, source):
r = requests.get(url=alphaurl) r = requests.get(url=alphaurl)
data = r.json() data = r.json()
output = dict() output = dict()
#print(data) print(data["Results"])
try: try:
if data["Count"] > 0: if data["Count"] > 0:
#print(data["Results"][0]["Url"]) #print(data["Results"][0]["Url"])
for result in data["Results"]: for result in data["Results"]:
if result["Url"].split("/")[-1] == partnum: #print(result["Url"].split("/")[-1], partnum.replace("-", "").replace("/", "_"))
if result["Url"].split("/")[-1] == partnum.replace("-", "").replace("/", "_"):
#print(partnum) #print(partnum)
#print(result["Html"]) #print(result["Html"])
try: try:
@ -133,7 +135,7 @@ def query_search(partnum, source):
dsidx = result["Html"].index("<a href=\"/disteAPI/") + 9 dsidx = result["Html"].index("<a href=\"/disteAPI/") + 9
dsidx2 = result["Html"].index(partnum, dsidx) + len(partnum) dsidx2 = result["Html"].index(partnum, dsidx) + len(partnum)
output["datasheet"] = "https://www.alphawire.com" + result["Html"][dsidx:dsidx2] output["datasheet"] = "https://www.alphawire.com" + result["Html"][dsidx:dsidx2]
output["partnum"] = partnum output["partnum"] = partnum.replace("-", "").replace("/", "_")
#"test".index() #"test".index()
#print(output) #print(output)
return output return output
@ -167,8 +169,9 @@ def get_multi(partnums, delay=0.25):
if r.status_code == 404: if r.status_code == 404:
return False return False
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
bartext = ""
with open(output_dir + "/datasheet.pdf", 'wb') as f: with open(output_dir + "/datasheet.pdf", 'wb') as f:
for chunk in r.iter_content(chunk_size=131072): for chunk in r.iter_content(chunk_size=65536):
# If you have chunk encoded response uncomment if # If you have chunk encoded response uncomment if
# and set chunk_size parameter to None. # and set chunk_size parameter to None.
#if chunk: #if chunk:
@ -195,8 +198,9 @@ def get_multi(partnums, delay=0.25):
if r.status_code == 404: if r.status_code == 404:
return False return False
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
bartext = ""
with open(output_dir + "/datasheet.pdf", 'wb') as f: with open(output_dir + "/datasheet.pdf", 'wb') as f:
for chunk in r.iter_content(chunk_size=131072): for chunk in r.iter_content(chunk_size=65536):
# If you have chunk encoded response uncomment if # If you have chunk encoded response uncomment if
# and set chunk_size parameter to None. # and set chunk_size parameter to None.
#if chunk: #if chunk:
@ -221,8 +225,9 @@ def get_multi(partnums, delay=0.25):
if r.status_code == 404: if r.status_code == 404:
return False return False
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
bartext = ""
with open(output_dir + "/part-hires." + url.split(".")[-1], 'wb') as f: with open(output_dir + "/part-hires." + url.split(".")[-1], 'wb') as f:
for chunk in r.iter_content(chunk_size=131072): for chunk in r.iter_content(chunk_size=65536):
# If you have chunk encoded response uncomment if # If you have chunk encoded response uncomment if
# and set chunk_size parameter to None. # and set chunk_size parameter to None.
#if chunk: #if chunk:
@ -377,7 +382,7 @@ if __name__ == "__main__":
"AW3050", "AW3050",
"AW6714", "AW6714",
"AW1172C", "AW1172C",
"AW2211/4", "AWFIT-221-1/4",
"BLTF-1LF-006-RS5N", "BLTF-1LF-006-RS5N",
"BLTF-SD9-006-RI5N", "BLTF-SD9-006-RI5N",
@ -421,7 +426,7 @@ if __name__ == "__main__":
"BLC6D1100007" "BLC6D1100007"
] ]
#query_search("86104CY", "Alphawire") #print(query_search("FIT-221-1/4", "Alphawire"))
get_multi(partnums, 0.25) get_multi(partnums, 0.25)
#query_search("10GXS13", "Belden") #query_search("10GXS13", "Belden")

View File

@ -19,17 +19,41 @@ def touch(path):
with open(path, 'a'): with open(path, 'a'):
os.utime(path, None) os.utime(path, None)
def extract_table_name(table_start, searchpage, reader, dstype, fallbackname):
    """Return the text found in a thin horizontal band around a table's top edge.

    The text collected from that band is used as the table's name.

    Args:
        table_start: y coordinate of the table's top edge on the page.
        searchpage: 1-based page number to search; ``reader.pages`` is
            indexed with ``searchpage - 1``.
        reader: object exposing ``pages``, where each page provides
            ``extract_text(visitor_text=...)`` (a pypdf ``PdfReader`` in the
            calling code).
        dstype: datasheet flavour. ``"Alphawire"`` widens the band to start
            5 units below ``table_start``; ``"Belden"`` and any other value
            use the band ``table_start .. table_start + 10``.
        fallbackname: value returned (as ``str``) when no text lies in the
            band.

    Returns:
        The concatenated text fragments inside the band with leading and
        trailing newlines stripped, or ``str(fallbackname)`` if that is empty.
    """
    # Previously an unrecognised dstype left ymin/ymax unassigned and the
    # visitor crashed with UnboundLocalError. Unknown types now use the plain
    # band, which is what the inline code did before this helper existed.
    # NOTE(review): presumably Alphawire headings sit closer to the table
    # border, hence the extra 5 units - confirm against real datasheets.
    band_below = 5 if dstype == "Alphawire" else 0
    ymin = table_start - band_below
    ymax = table_start + 10

    page = reader.pages[searchpage - 1]
    parts = []

    def visitor_body(text, cm, tm, fontDict, fontSize):
        # tm[5] is the y component of the text matrix; both bounds are
        # exclusive, exactly as in the original comparison.
        if ymin < tm[5] < ymax:
            parts.append(text)

    page.extract_text(visitor_text=visitor_body)
    text_body = "".join(parts).strip('\n')
    return text_body if text_body else str(fallbackname)
#fprint(text_body)
def parse(filename, output_dir, partnum, dstype): def parse(filename, output_dir, partnum, dstype):
# Extract table data # Extract table data
if dstype == "Belden":
tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't']) tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
elif dstype == "Alphawire":
tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=50, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': True, 'char_margin': 0.5}, shift_text=['l', 't'])
#fprint("Total tables extracted:", tables.n) #fprint("Total tables extracted:", tables.n)
n = 0 n = 0
pagenum = 0 #pagenum = 0
reader = PdfReader(filename) reader = PdfReader(filename)
page = reader.pages[0] page = reader.pages[0]
table_list = {} table_list = {}
table_list_raw = {}
for table in tables: for table in tables:
table.df.infer_objects(copy=False) table.df.infer_objects(copy=False)
@ -43,25 +67,13 @@ def parse(filename, output_dir, partnum, dstype):
# Extract table names # Extract table names
table_start = table.cells[0][0].lt[1] # Read top-left cell's top-left coordinate table_start = table.cells[0][0].lt[1] # Read top-left cell's top-left coordinate
#fprint(table_start) #fprint(table_start)
ymin = table_start
ymax = table_start + 10
if pagenum != table.page - 1:
pagenum = table.page - 1
page = reader.pages[table.page - 1]
parts = []
def visitor_body(text, cm, tm, fontDict, fontSize):
y = tm[5]
if y > ymin and y < ymax:
parts.append(text)
page.extract_text(visitor_text=visitor_body)
text_body = "".join(parts).strip('\n')
if len(text_body) == 0:
text_body = str(n)
#fprint(text_body)
text_body = extract_table_name(table_start, table.page, reader, dstype, n)
table_list[text_body] = table.df table_list[text_body] = table.df
if dstype == "Alphawire":
table_list_raw[text_body] = table
#table.to_html("table" + str(n) + ".html") #table.to_html("table" + str(n) + ".html")
#fprint(table.df) #fprint(table.df)
@ -71,7 +83,7 @@ def parse(filename, output_dir, partnum, dstype):
#tables.export(output_dir + '/techdata.json', f='json') #tables.export(output_dir + '/techdata.json', f='json')
# fprint(table_list) #fprint(table_list)
# Extract Basic details - part name & description, image, etc # Extract Basic details - part name & description, image, etc
reader = PdfReader(filename) reader = PdfReader(filename)
@ -100,24 +112,32 @@ def parse(filename, output_dir, partnum, dstype):
tables = dict() tables = dict()
torename = dict() torename = dict()
previous_table = "" previous_table = ""
print(table_list.keys())
for table_name in table_list.keys(): for table_name in table_list.keys():
# determine shape: horizontal or vertical # determine shape: horizontal or vertical
table = table_list[table_name] table = table_list[table_name]
rows = table.shape[0] rows = table.shape[0]
cols = table.shape[1] cols = table.shape[1]
vertical = None vertical = None
print(rows, cols, table_name)
if rows > 2 and cols == 2: if rows > 2 and cols == 2:
vertical = True vertical = True
elif cols == 1: elif cols == 1 and rows > 1:
vertical = False vertical = False
elif rows == 1: elif rows == 1:
vertical = True vertical = True
elif cols == 2: # and rows <= 2 elif cols == 2: # and rows <= 2
# inconsistent # inconsistent
if table.iloc[0, 0].find(":") == len(table.iloc[0, 0]) - 1: # check if last character is ":" indicating a vertical table if dstype == "Belden":
vertical = True if table.iloc[0, 0].find(":") == len(table.iloc[0, 0]) - 1: # check if last character is ":" indicating a vertical table
else: vertical = True
vertical = False else:
vertical = False
elif dstype == "Alphawire":
if table.iloc[0, 0].find(")") == 1 or table.iloc[0, 0].find(")") == 2 or table.iloc[0, 0].find(":") == len(table.iloc[0, 0]) - 1: # check if last character is ":" indicating a vertical table
vertical = True
else:
vertical = False
elif cols > 2: # and rows <= 2 elif cols > 2: # and rows <= 2
vertical = False vertical = False
@ -125,9 +145,12 @@ def parse(filename, output_dir, partnum, dstype):
vertical = False vertical = False
else: # 1 column, <= 2 rows else: # 1 column, <= 2 rows
vertical = False vertical = False
print(vertical)
# missing name check # missing name check
for table_name_2 in table_list.keys(): for table_name_2 in table_list.keys():
if dstype == "Alphawire" and table_name_2.find("\n") >= 0:
torename[table_name_2] = table_name_2[0:table_name_2.find("\n")]
if table_name_2.find(table.iloc[-1, 0]) >= 0: if table_name_2.find(table.iloc[-1, 0]) >= 0:
# Name taken from table directly above - this table does not have a name # Name taken from table directly above - this table does not have a name
torename[table_name_2] = "Specs " + str(len(tables)) torename[table_name_2] = "Specs " + str(len(tables))
@ -136,8 +159,12 @@ def parse(filename, output_dir, partnum, dstype):
if vertical: if vertical:
out = dict() out = dict()
for row in table.itertuples(index=False, name=None): if rows > 1:
out[row[0].replace("\n", " ").replace(":", "")] = row[1] for row in table.itertuples(index=False, name=None):
out[row[0].replace("\n", " ").replace(":", "")] = row[1]
else:
for row in table.itertuples(index=False, name=None):
out[row[0].replace("\n", " ").replace(":", "")] = ""
else: # horizontal else: # horizontal
out = dict() out = dict()
@ -147,9 +174,52 @@ def parse(filename, output_dir, partnum, dstype):
tables[table_name] = out tables[table_name] = out
# multi-page table check, Alphawire
if dstype == "Alphawire" and table_name.isdigit():
# table continues from previous page or has name on previous page
thistbl = table_list_raw[table_name]
prevtbl = table_list_raw[previous_table]
if prevtbl.cells[-1][0].lb[1] < 50 and thistbl.cells[0][0].lt[1] > 600:
# wraparound
print("WRAP")
print("PREV TABLE", prevtbl.df)
print("THIS TABLE", thistbl.df)
print("PREV TABLE CORNER", prevtbl.cells[-1][0].lb[1])
print("THIS TABLE CORNER", thistbl.cells[0][0].lt[1])
main_key = previous_table
cont_key = table_name
#print(vertical)
if vertical == False:
main_keys = list(tables[main_key].keys())
for i, (cont_key, cont_values) in enumerate(tables[cont_key].items()):
if i < len(main_keys):
#print(tables[main_key][main_keys[i]])
tables[main_key][main_keys[i]] = (tuple(tables[main_key][main_keys[i]]) + (cont_key,) + cont_values)
del tables[table_name]
else:
print(tables[cont_key].keys())
for key in tables[cont_key].keys():
print(main_key, key, cont_key, key)
tables[main_key][key] = tables[cont_key][key]
del tables[table_name]
elif thistbl.cells[0][0].lt[1] > 600:
# name on previous page (grrrr)
print("NAMEABOVE")
print("PREV TABLE", prevtbl.df)
print("THIS TABLE", thistbl.df)
print("PREV TABLE CORNER", prevtbl.cells[-1][0].lb[1])
print("THIS TABLE CORNER", thistbl.cells[0][0].lt[1])
name = extract_table_name(50, prevtbl.page,reader,dstype,table_name).strip("\n").strip()
print("FOUND NAME:", name)
torename[table_name] = name
# multi-page table check
# multi-page table check, Belden
if dstype == "Belden": if dstype == "Belden":
if table_name.isdigit() and len(tables) > 1: if table_name.isdigit() and len(tables) > 1:
#fprint(table_name) #fprint(table_name)
@ -212,9 +282,13 @@ def parse(filename, output_dir, partnum, dstype):
#print(output_table) #print(output_table)
run_cmd("rm \"" + output_dir + "\"/*.json") # not reliable! run_cmd("rm \"" + output_dir + "\"/*.json") # not reliable!
with open(output_dir + "/" + output_table["searchspecs"]["id"] + ".json", 'w') as json_file: with open(output_dir + "/search_" + output_table["searchspecs"]["id"] + ".json", 'w') as json_file:
json.dump(output_table["searchspecs"], json_file) json.dump(output_table["searchspecs"], json_file)
touch(output_dir + "/parsed") with open(output_dir + "/specs_" + output_table["partnum"] + ".json", 'w') as json_file:
json.dump(output_table["fullspecs"], json_file)
print(json.dumps(output_table, indent=2))
touch(output_dir + "/parsed") # mark as parsed
return output_table return output_table
@ -245,7 +319,6 @@ def flatten(tables):
#print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",") #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
elif len(tables[table][key]) == 1: elif len(tables[table][key]) == 1:
out[fullkeyname] = convert_to_number(tables[table][key][0]) out[fullkeyname] = convert_to_number(tables[table][key][0])
#print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",") #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
# if the item has at least two commas in it, split it # if the item has at least two commas in it, split it
@ -256,7 +329,7 @@ def flatten(tables):
# if the item has at least two commas in it, split it # if the item has at least two commas in it, split it
if tables[table][key].count(',') > 0: if tables[table][key].count(',') > 0:
out[fullkeyname] = list(map(lambda x: x.strip(), tables[table][key].split(","))) out[fullkeyname] = list(map(lambda x: x.strip(), tables[table][key].split(",")))
print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",") #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
#print("}") #print("}")
@ -265,4 +338,4 @@ def flatten(tables):
if __name__ == "__main__": if __name__ == "__main__":
parse("test2.pdf", "cables/10GXS13", "10GXS13") parse("cables/3050/datasheet.pdf", "cables/3050", "3050", "Alphawire")