diff --git a/get_specs.py b/get_specs.py
index 17f3801..7c16010 100755
--- a/get_specs.py
+++ b/get_specs.py
@@ -112,12 +112,14 @@ def query_search(partnum, source):
     r = requests.get(url=alphaurl)
     data = r.json()
     output = dict()
-    #print(data)
+    print(data["Results"])
+
+    try:
     if data["Count"] > 0:
         #print(data["Results"][0]["Url"])
         for result in data["Results"]:
-            if result["Url"].split("/")[-1] == partnum:
+            #print(result["Url"].split("/")[-1], partnum.replace("-", "").replace("/", "_"))
+            if result["Url"].split("/")[-1] == partnum.replace("-", "").replace("/", "_"):
                 #print(partnum)
                 #print(result["Html"])
                 try:
@@ -133,7 +135,7 @@ def query_search(partnum, source):
                     dsidx = result["Html"].index("

+            if y > ymin and y < ymax:
+                parts.append(text)
+    page.extract_text(visitor_text=visitor_body)
+    text_body = "".join(parts).strip('\n')
+    if len(text_body) == 0:
+        text_body = str(fallbackname)
+
+    return text_body
+    #fprint(text_body)
+
 def parse(filename, output_dir, partnum, dstype):
     # Extract table data
-
-    tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
+    if dstype == "Belden":
+        tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
+    elif dstype == "Alphawire":
+        tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=50, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': True, 'char_margin': 0.5}, shift_text=['l', 't'])
     #fprint("Total tables extracted:", tables.n)
     n = 0
-    pagenum = 0
+    #pagenum = 0
     reader = PdfReader(filename)
     page = reader.pages[0]
     table_list = {}
+    table_list_raw = {}
     for table in tables:
         table.df.infer_objects(copy=False)
@@ -43,25 +67,13 @@ def parse(filename, output_dir, partnum, dstype):
         # Extract table names
         table_start = table.cells[0][0].lt[1] # Read top-left cell's top-left coordinate
         #fprint(table_start)
-        ymin = table_start
-        ymax = table_start + 10
-        if pagenum != table.page - 1:
-            pagenum = table.page - 1
-            page = reader.pages[table.page - 1]
-        parts = []
-        def visitor_body(text, cm, tm, fontDict, fontSize):
-            y = tm[5]
-            if y > ymin and y < ymax:
-                parts.append(text)
-
-        page.extract_text(visitor_text=visitor_body)
-        text_body = "".join(parts).strip('\n')
-        if len(text_body) == 0:
-            text_body = str(n)
-        #fprint(text_body)
+        text_body = extract_table_name(table_start, table.page, reader, dstype, n)
         table_list[text_body] = table.df
+        if dstype == "Alphawire":
+            table_list_raw[text_body] = table
+
         #table.to_html("table" + str(n) + ".html")
         #fprint(table.df)
@@ -71,7 +83,7 @@ def parse(filename, output_dir, partnum, dstype):
     #tables.export(output_dir + '/techdata.json', f='json')
-    # fprint(table_list)
+    #fprint(table_list)
     # Extract Basic details - part name & description, image, etc
     reader = PdfReader(filename)
@@ -100,24 +112,32 @@ def parse(filename, output_dir, partnum, dstype):
     tables = dict()
     torename = dict()
     previous_table = ""
+    print(table_list.keys())
     for table_name in table_list.keys():
        # determine shape: horizontal or vertical
         table = table_list[table_name]
         rows = table.shape[0]
         cols = table.shape[1]
         vertical = None
+        print(rows, cols, table_name)
         if rows > 2 and cols == 2:
             vertical = True
-        elif cols == 1:
+        elif cols == 1 and rows > 1:
             vertical = False
         elif rows == 1:
             vertical = True
         elif cols == 2: # and rows <= 2 # inconsistent
-            if table.iloc[0, 0].find(":") == len(table.iloc[0, 0]) - 1: # check if last character is ":" indicating a vertical table
-                vertical = True
-            else:
-                vertical = False
+            if dstype == "Belden":
+                if table.iloc[0, 0].find(":") == len(table.iloc[0, 0]) - 1: # check if last character is ":" indicating a vertical table
+                    vertical = True
+                else:
+                    vertical = False
+            elif dstype == "Alphawire":
+                if table.iloc[0, 0].find(")") == 1 or table.iloc[0, 0].find(")") == 2 or table.iloc[0, 0].find(":") == len(table.iloc[0, 0]) - 1: # check if last character is ":" indicating a vertical table
+                    vertical = True
+                else:
+                    vertical = False
         elif cols > 2: # and rows <= 2
             vertical = False
@@ -125,9 +145,12 @@ def parse(filename, output_dir, partnum, dstype):
             vertical = False
         else: # 1 column, <= 2 rows
             vertical = False
-
+        print(vertical)
         # missing name check
         for table_name_2 in table_list.keys():
+            if dstype == "Alphawire" and table_name_2.find("\n") >= 0:
+                torename[table_name_2] = table_name_2[0:table_name_2.find("\n")]
+
             if table_name_2.find(table.iloc[-1, 0]) >= 0:
                 # Name taken from table directly above - this table does not have a name
                 torename[table_name_2] = "Specs " + str(len(tables))
@@ -136,8 +159,12 @@ def parse(filename, output_dir, partnum, dstype):
         if vertical:
             out = dict()
-            for row in table.itertuples(index=False, name=None):
-                out[row[0].replace("\n", " ").replace(":", "")] = row[1]
+            if rows > 1:
+                for row in table.itertuples(index=False, name=None):
+                    out[row[0].replace("\n", " ").replace(":", "")] = row[1]
+            else:
+                for row in table.itertuples(index=False, name=None):
+                    out[row[0].replace("\n", " ").replace(":", "")] = ""
         else: # horizontal
             out = dict()
@@ -147,9 +174,52 @@ def parse(filename, output_dir, partnum, dstype):
         tables[table_name] = out
+        # multi-page table check, Alphawire
+        if dstype == "Alphawire" and table_name.isdigit():
+            # table continues from previous page or has name on previous page
+            thistbl = table_list_raw[table_name]
+            prevtbl = table_list_raw[previous_table]
+
+            if prevtbl.cells[-1][0].lb[1] < 50 and thistbl.cells[0][0].lt[1] > 600:
+                # wraparound
+                print("WRAP")
+                print("PREV TABLE", prevtbl.df)
+                print("THIS TABLE", thistbl.df)
+                print("PREV TABLE CORNER", prevtbl.cells[-1][0].lb[1])
+                print("THIS TABLE CORNER", thistbl.cells[0][0].lt[1])
+                main_key = previous_table
+                cont_key = table_name
+                #print(vertical)
+                if vertical == False:
+                    main_keys = list(tables[main_key].keys())
+                    for i, (cont_key, cont_values) in enumerate(tables[cont_key].items()):
+                        if i < len(main_keys):
+                            #print(tables[main_key][main_keys[i]])
+                            tables[main_key][main_keys[i]] = (tuple(tables[main_key][main_keys[i]]) + (cont_key,) + cont_values)
+
+                    del tables[table_name]
+
+                else:
+                    print(tables[cont_key].keys())
+                    for key in tables[cont_key].keys():
+                        print(main_key, key, cont_key, key)
+                        tables[main_key][key] = tables[cont_key][key]
+                    del tables[table_name]
+
+            elif thistbl.cells[0][0].lt[1] > 600:
+                # name on previous page (grrrr)
+                print("NAMEABOVE")
+                print("PREV TABLE", prevtbl.df)
+                print("THIS TABLE", thistbl.df)
+                print("PREV TABLE CORNER", prevtbl.cells[-1][0].lb[1])
+                print("THIS TABLE CORNER", thistbl.cells[0][0].lt[1])
+                name = extract_table_name(50, prevtbl.page,reader,dstype,table_name).strip("\n").strip()
+                print("FOUND NAME:", name)
+                torename[table_name] = name
-        # multi-page table check
+
+        # multi-page table check, Belden
         if dstype == "Belden":
             if table_name.isdigit() and len(tables) > 1:
                 #fprint(table_name)
@@ -212,9 +282,13 @@ def parse(filename, output_dir, partnum, dstype):
     #print(output_table)
     run_cmd("rm \"" + output_dir + "\"/*.json") # not reliable!
-    with open(output_dir + "/" + output_table["searchspecs"]["id"] + ".json", 'w') as json_file:
+    with open(output_dir + "/search_" + output_table["searchspecs"]["id"] + ".json", 'w') as json_file:
         json.dump(output_table["searchspecs"], json_file)
-    touch(output_dir + "/parsed")
+    with open(output_dir + "/specs_" + output_table["partnum"] + ".json", 'w') as json_file:
+        json.dump(output_table["fullspecs"], json_file)
+
+    print(json.dumps(output_table, indent=2))
+    touch(output_dir + "/parsed") # mark as parsed
     return output_table
@@ -245,7 +319,6 @@ def flatten(tables):
                     #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
                 elif len(tables[table][key]) == 1:
                     out[fullkeyname] = convert_to_number(tables[table][key][0])
-                    #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
                     # if the item has at least two commas in it, split it
@@ -256,7 +329,7 @@ def flatten(tables):
                     # if the item has at least two commas in it, split it
                     if tables[table][key].count(',') > 0:
                         out[fullkeyname] = list(map(lambda x: x.strip(), tables[table][key].split(",")))
-                        print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
+                        #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
     #print("}")
@@ -265,4 +338,4 @@
 
 
 if __name__ == "__main__":
-    parse("test2.pdf", "cables/10GXS13", "10GXS13")
\ No newline at end of file
+    parse("cables/3050/datasheet.pdf", "cables/3050", "3050", "Alphawire")
\ No newline at end of file
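For reference, a minimal standalone sketch of the extract_table_name() helper this patch factors out of parse(). The hunk that introduces the top of the function was not captured above, so the signature and the page lookup below are inferred from the two call sites in the diff, extract_table_name(table_start, table.page, reader, dstype, n) and extract_table_name(50, prevtbl.page, reader, dstype, table_name), and from the code removed later in the parse() hunk; treat them as assumptions, not the literal patch content.

# Sketch only: signature and page indexing are inferred, not verbatim from the patch.
def extract_table_name(table_start, pagenum, reader, dstype, fallbackname):
    # dstype is passed by the callers but is unused in the surviving lines of the hunk.
    ymin = table_start          # top edge of the table's first cell (camelot coordinate, y grows upward)
    ymax = table_start + 10     # assume the table caption sits within 10 points above the table
    page = reader.pages[pagenum - 1]  # camelot page numbers are 1-based, pypdf/PyPDF2 pages are 0-based
    parts = []

    def visitor_body(text, cm, tm, fontDict, fontSize):
        y = tm[5]               # y translation of the current text fragment
        if y > ymin and y < ymax:
            parts.append(text)

    page.extract_text(visitor_text=visitor_body)  # pypdf visitor callback collects matching fragments
    text_body = "".join(parts).strip('\n')
    if len(text_body) == 0:
        text_body = str(fallbackname)  # fall back to the caller-supplied table index or name
    return text_body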