Update Alphawire table parsing
parent 25ceb6c133
commit 39723ec442
get_specs.py
@@ -112,12 +112,14 @@ def query_search(partnum, source):
     r = requests.get(url=alphaurl)
     data = r.json()
     output = dict()
     #print(data)
+    print(data["Results"])
+
     try:
         if data["Count"] > 0:
             #print(data["Results"][0]["Url"])
             for result in data["Results"]:
-                if result["Url"].split("/")[-1] == partnum:
+                #print(result["Url"].split("/")[-1], partnum.replace("-", "").replace("/", "_"))
+                if result["Url"].split("/")[-1] == partnum.replace("-", "").replace("/", "_"):
                     #print(partnum)
                     #print(result["Html"])
                     try:
@@ -133,7 +135,7 @@ def query_search(partnum, source):
                         dsidx = result["Html"].index("<a href=\"/disteAPI/") + 9
                         dsidx2 = result["Html"].index(partnum, dsidx) + len(partnum)
                         output["datasheet"] = "https://www.alphawire.com" + result["Html"][dsidx:dsidx2]
-                        output["partnum"] = partnum
+                        output["partnum"] = partnum.replace("-", "").replace("/", "_")
                         #"test".index()
                         #print(output)
                         return output
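
The change to output["partnum"] applies the same normalization used in the URL match above: Alphawire's result slugs drop dashes and turn slashes into underscores. A minimal sketch of the rule (normalize_partnum is a hypothetical helper; the script inlines the replace() calls):

    def normalize_partnum(partnum):
        # "FIT-221-1/4" -> "FIT2211_4", matching the trailing segment of an
        # Alphawire result URL such as .../FIT2211_4
        return partnum.replace("-", "").replace("/", "_")

    assert normalize_partnum("FIT-221-1/4") == "FIT2211_4"
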
@@ -167,8 +169,9 @@ def get_multi(partnums, delay=0.25):
         if r.status_code == 404:
             return False
+        os.makedirs(output_dir, exist_ok=True)
         bartext = ""
         with open(output_dir + "/datasheet.pdf", 'wb') as f:
-            for chunk in r.iter_content(chunk_size=131072):
+            for chunk in r.iter_content(chunk_size=65536):
                 # If you have chunk encoded response uncomment if
                 # and set chunk_size parameter to None.
                 #if chunk:
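
This hunk and the two below make the same pair of changes at each download site: ensure the output directory exists before opening the file, and halve the streaming chunk size from 128 KiB to 64 KiB. For context, the surrounding pattern is the usual requests streamed download, roughly as sketched here (assuming the request was opened with stream=True; the URL and paths are placeholders):

    import os
    import requests

    r = requests.get("https://example.com/datasheet.pdf", stream=True)
    if r.status_code != 404:
        os.makedirs("cables/example", exist_ok=True)
        with open("cables/example/datasheet.pdf", "wb") as f:
            for chunk in r.iter_content(chunk_size=65536):  # 64 KiB per read
                f.write(chunk)
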
@@ -195,8 +198,9 @@ def get_multi(partnums, delay=0.25):
         if r.status_code == 404:
             return False
+        os.makedirs(output_dir, exist_ok=True)
         bartext = ""
         with open(output_dir + "/datasheet.pdf", 'wb') as f:
-            for chunk in r.iter_content(chunk_size=131072):
+            for chunk in r.iter_content(chunk_size=65536):
                 # If you have chunk encoded response uncomment if
                 # and set chunk_size parameter to None.
                 #if chunk:
@@ -221,8 +225,9 @@ def get_multi(partnums, delay=0.25):
         if r.status_code == 404:
             return False
+        os.makedirs(output_dir, exist_ok=True)
         bartext = ""
         with open(output_dir + "/part-hires." + url.split(".")[-1], 'wb') as f:
-            for chunk in r.iter_content(chunk_size=131072):
+            for chunk in r.iter_content(chunk_size=65536):
                 # If you have chunk encoded response uncomment if
                 # and set chunk_size parameter to None.
                 #if chunk:
@@ -377,7 +382,7 @@ if __name__ == "__main__":
         "AW3050",
         "AW6714",
         "AW1172C",
-        "AW2211/4",
+        "AWFIT-221-1/4",

         "BLTF-1LF-006-RS5N",
         "BLTF-SD9-006-RI5N",
@@ -421,7 +426,7 @@ if __name__ == "__main__":
         "BLC6D1100007"

     ]
-    #query_search("86104CY", "Alphawire")
+    #print(query_search("FIT-221-1/4", "Alphawire"))
     get_multi(partnums, 0.25)
     #query_search("10GXS13", "Belden")

@@ -19,17 +19,41 @@ def touch(path):
     with open(path, 'a'):
         os.utime(path, None)

+def extract_table_name(table_start, searchpage, reader, dstype, fallbackname):
+    if dstype == "Belden":
+        ymin = table_start
+        ymax = table_start + 10
+    elif dstype == "Alphawire":
+        ymin = table_start - 5
+        ymax = table_start + 10
+    page = reader.pages[searchpage - 1]
+    parts = []
+    def visitor_body(text, cm, tm, fontDict, fontSize):
+        y = tm[5]
+        if y > ymin and y < ymax:
+            parts.append(text)
+    page.extract_text(visitor_text=visitor_body)
+    text_body = "".join(parts).strip('\n')
+    if len(text_body) == 0:
+        text_body = str(fallbackname)
+
+    return text_body
+    #fprint(text_body)
+
 def parse(filename, output_dir, partnum, dstype):

     # Extract table data

+    if dstype == "Belden":
         tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
+    elif dstype == "Alphawire":
+        tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=50, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': True, 'char_margin': 0.5}, shift_text=['l', 't'])
     #fprint("Total tables extracted:", tables.n)
     n = 0
-    pagenum = 0
+    #pagenum = 0
     reader = PdfReader(filename)
     page = reader.pages[0]
     table_list = {}
+    table_list_raw = {}

     for table in tables:
         table.df.infer_objects(copy=False)
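
The new extract_table_name() factors out the pypdf visitor pattern the loop below used to inline: extract_text() invokes the visitor for every text fragment, tm[5] is the fragment's y translation in PDF points (measured from the page bottom), and filtering on it keeps only the text in a narrow band above the table's top edge. A standalone sketch of the technique, assuming pypdf's visitor API (the file name and band bounds are placeholders):

    from pypdf import PdfReader

    reader = PdfReader("example.pdf")
    parts = []

    def visitor_body(text, cm, tm, fontDict, fontSize):
        y = tm[5]  # y position of this fragment, in points from the page bottom
        if 700 < y < 720:  # keep only a 20 pt band near the top of the page
            parts.append(text)

    reader.pages[0].extract_text(visitor_text=visitor_body)
    print("".join(parts).strip("\n"))
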
@@ -43,25 +67,13 @@ def parse(filename, output_dir, partnum, dstype):
         # Extract table names
         table_start = table.cells[0][0].lt[1] # Read top-left cell's top-left coordinate
         #fprint(table_start)
-        ymin = table_start
-        ymax = table_start + 10
-        if pagenum != table.page - 1:
-            pagenum = table.page - 1
-            page = reader.pages[table.page - 1]
-        parts = []
-        def visitor_body(text, cm, tm, fontDict, fontSize):
-            y = tm[5]
-            if y > ymin and y < ymax:
-                parts.append(text)
-
-        page.extract_text(visitor_text=visitor_body)
-        text_body = "".join(parts).strip('\n')
-        if len(text_body) == 0:
-            text_body = str(n)
-        #fprint(text_body)
+        text_body = extract_table_name(table_start, table.page, reader, dstype, n)

         table_list[text_body] = table.df
+        if dstype == "Alphawire":
+            table_list_raw[text_body] = table

         #table.to_html("table" + str(n) + ".html")

         #fprint(table.df)
@@ -100,24 +112,32 @@ def parse(filename, output_dir, partnum, dstype):
     tables = dict()
     torename = dict()
+    previous_table = ""
+    print(table_list.keys())
     for table_name in table_list.keys():
         # determine shape: horizontal or vertical
         table = table_list[table_name]
         rows = table.shape[0]
         cols = table.shape[1]
         vertical = None
+        print(rows, cols, table_name)
         if rows > 2 and cols == 2:
             vertical = True
-        elif cols == 1:
+        elif cols == 1 and rows > 1:
             vertical = False
         elif rows == 1:
             vertical = True
         elif cols == 2: # and rows <= 2
             # inconsistent
             if dstype == "Belden":
                 if table.iloc[0, 0].find(":") == len(table.iloc[0, 0]) - 1: # check if last character is ":" indicating a vertical table
                     vertical = True
                 else:
                     vertical = False
+            elif dstype == "Alphawire":
+                if table.iloc[0, 0].find(")") == 1 or table.iloc[0, 0].find(")") == 2 or table.iloc[0, 0].find(":") == len(table.iloc[0, 0]) - 1: # check if last character is ":" indicating a vertical table
+                    vertical = True
+                else:
+                    vertical = False

         elif cols > 2: # and rows <= 2
             vertical = False
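
For reference, the shape heuristic above condensed into a standalone function (a restatement for readability, not the committed code): a "vertical" table is a key/value listing read row by row, a "horizontal" one is a header row with data columns, and the ambiguous two-column case is decided by punctuation in the first cell.

    def is_vertical(df, dstype):
        rows, cols = df.shape
        if rows > 2 and cols == 2:
            return True                    # many key/value rows
        if cols == 1 and rows > 1:
            return False
        if rows == 1:
            return True
        if cols == 2:                      # small, ambiguous table
            cell = df.iloc[0, 0]
            if dstype == "Alphawire" and cell.find(")") in (1, 2):
                return True                # "1)"-style row labels
            return cell.endswith(":")      # trailing ":" marks a key cell
        return False                       # wide tables read horizontally
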
@@ -125,9 +145,12 @@ def parse(filename, output_dir, partnum, dstype):
             vertical = False
         else: # 1 column, <= 2 rows
             vertical = False

+        print(vertical)
         # missing name check
         for table_name_2 in table_list.keys():
+            if dstype == "Alphawire" and table_name_2.find("\n") >= 0:
+                torename[table_name_2] = table_name_2[0:table_name_2.find("\n")]
+
             if table_name_2.find(table.iloc[-1, 0]) >= 0:
                 # Name taken from table directly above - this table does not have a name
                 torename[table_name_2] = "Specs " + str(len(tables))
@@ -136,8 +159,12 @@ def parse(filename, output_dir, partnum, dstype):

         if vertical:
             out = dict()
-            for row in table.itertuples(index=False, name=None):
-                out[row[0].replace("\n", " ").replace(":", "")] = row[1]
+            if rows > 1:
+                for row in table.itertuples(index=False, name=None):
+                    out[row[0].replace("\n", " ").replace(":", "")] = row[1]
+            else:
+                for row in table.itertuples(index=False, name=None):
+                    out[row[0].replace("\n", " ").replace(":", "")] = ""

         else: # horizontal
             out = dict()
@@ -147,9 +174,52 @@ def parse(filename, output_dir, partnum, dstype):

         tables[table_name] = out

+        # multi-page table check, Alphawire
+        if dstype == "Alphawire" and table_name.isdigit():
+            # table continues from previous page or has name on previous page
+            thistbl = table_list_raw[table_name]
+            prevtbl = table_list_raw[previous_table]
+
+            if prevtbl.cells[-1][0].lb[1] < 50 and thistbl.cells[0][0].lt[1] > 600:
+                # wraparound
+                print("WRAP")
+                print("PREV TABLE", prevtbl.df)
+                print("THIS TABLE", thistbl.df)
+                print("PREV TABLE CORNER", prevtbl.cells[-1][0].lb[1])
+                print("THIS TABLE CORNER", thistbl.cells[0][0].lt[1])
+                main_key = previous_table
+                cont_key = table_name
+                #print(vertical)
+                if vertical == False:
+                    main_keys = list(tables[main_key].keys())
+                    for i, (cont_key, cont_values) in enumerate(tables[cont_key].items()):
+                        if i < len(main_keys):
+                            #print(tables[main_key][main_keys[i]])
+                            tables[main_key][main_keys[i]] = (tuple(tables[main_key][main_keys[i]]) + (cont_key,) + cont_values)
+
+                    del tables[table_name]
+
+                else:
+                    print(tables[cont_key].keys())
+                    for key in tables[cont_key].keys():
+                        print(main_key, key, cont_key, key)
+                        tables[main_key][key] = tables[cont_key][key]
+                    del tables[table_name]
+
+            elif thistbl.cells[0][0].lt[1] > 600:
+                # name on previous page (grrrr)
+                print("NAMEABOVE")
+                print("PREV TABLE", prevtbl.df)
+                print("THIS TABLE", thistbl.df)
+                print("PREV TABLE CORNER", prevtbl.cells[-1][0].lb[1])
+                print("THIS TABLE CORNER", thistbl.cells[0][0].lt[1])
+                name = extract_table_name(50, prevtbl.page,reader,dstype,table_name).strip("\n").strip()
+                print("FOUND NAME:", name)
+                torename[table_name] = name
+

-        # multi-page table check
+        # multi-page table check, Belden
+        if dstype == "Belden":
             if table_name.isdigit() and len(tables) > 1:
                 #fprint(table_name)
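
In the horizontal wraparound branch above, the continuation table's "header" cells are really data, so each column of the main table gains the continuation's header cell plus that column's values. A toy illustration with made-up values:

    main = {"AWG": ("24",), "Stranding": ("7/32",)}    # main table
    cont = {"25": ("26",), "7/34": ("7/36",)}          # continuation page

    main_keys = list(main.keys())
    for i, (cont_key, cont_values) in enumerate(cont.items()):
        if i < len(main_keys):
            main[main_keys[i]] = tuple(main[main_keys[i]]) + (cont_key,) + cont_values

    print(main)  # {'AWG': ('24', '25', '26'), 'Stranding': ('7/32', '7/34', '7/36')}
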
@@ -212,9 +282,13 @@ def parse(filename, output_dir, partnum, dstype):
     #print(output_table)

     run_cmd("rm \"" + output_dir + "\"/*.json") # not reliable!
-    with open(output_dir + "/" + output_table["searchspecs"]["id"] + ".json", 'w') as json_file:
+    with open(output_dir + "/search_" + output_table["searchspecs"]["id"] + ".json", 'w') as json_file:
         json.dump(output_table["searchspecs"], json_file)
-    touch(output_dir + "/parsed")
+    with open(output_dir + "/specs_" + output_table["partnum"] + ".json", 'w') as json_file:
+        json.dump(output_table["fullspecs"], json_file)
+
+    print(json.dumps(output_table, indent=2))
+    touch(output_dir + "/parsed") # mark as parsed
     return output_table

@@ -245,7 +319,6 @@ def flatten(tables):
                 #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
             elif len(tables[table][key]) == 1:
                 out[fullkeyname] = convert_to_number(tables[table][key][0])
-
                 #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")

         # if the item has at least two commas in it, split it
@@ -256,7 +329,7 @@ def flatten(tables):
         # if the item has at least two commas in it, split it
         if tables[table][key].count(',') > 0:
             out[fullkeyname] = list(map(lambda x: x.strip(), tables[table][key].split(",")))
-            print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
+            #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")


     #print("}")
@@ -265,4 +338,4 @@ def flatten(tables):


 if __name__ == "__main__":
-    parse("test2.pdf", "cables/10GXS13", "10GXS13")
+    parse("cables/3050/datasheet.pdf", "cables/3050", "3050", "Alphawire")