#!/usr/bin/env python3

"""Parse Belden catalog techdata datasheets."""

import glob
import io
import json
import os
import uuid

import camelot
import numpy as np
import pandas as pd
from PIL import Image
from PyPDF2 import PdfReader

from util import fprint  # noqa: F401  (kept for ad-hoc debugging)
from util import run_cmd  # noqa: F401  (kept; no longer used for cleanup)

pd.set_option('future.no_silent_downcasting', True)


def touch(path):
    """Create *path* if it is missing and set its timestamps to now."""
    with open(path, 'a'):
        os.utime(path, None)


def _read_named_tables(filename):
    """Return ``{name: DataFrame}`` for every non-empty table in the PDF.

    A table's name is the page text found in the 10-unit band starting at
    the y coordinate of the table's top-left cell.  When no text is found
    there, the running count of kept tables (as a string) is used instead.
    Tables that resolve to the same name overwrite each other.
    """
    # NOTE(review): "interations" is probably a misspelling of camelot's
    # lattice option "iterations".  It is left untouched because correcting
    # it may change which tables are detected -- confirm and fix separately.
    extracted = camelot.read_pdf(
        filename,
        pages="1-end",
        flavor='lattice',
        backend="poppler",
        split_text=False,
        line_scale=100,
        process_background=True,
        resolution=600,
        interations=1,
        layout_kwargs={'detect_vertical': False, 'char_margin': 0.5},
        shift_text=['r', 't'],
    )

    reader = PdfReader(filename)
    pagenum = 0
    page = reader.pages[0]
    table_list = {}
    n = 0
    for table in extracted:
        # The result of infer_objects() is discarded, as in the original.
        table.df.infer_objects(copy=False)
        # Drop rows/columns that contain only empty strings.
        table.df.replace('', np.nan, inplace=True)
        table.df.dropna(inplace=True, how="all")
        table.df.dropna(inplace=True, axis="columns", how="all")
        table.df.replace(np.nan, '', inplace=True)
        if table.df.empty:
            continue

        # y coordinate of the top-left corner of the top-left cell.
        ymin = table.cells[0][0].lt[1]
        ymax = ymin + 10

        page_index = table.page - 1
        if page_index != pagenum:
            pagenum = page_index
            page = reader.pages[page_index]

        parts = []

        def visitor_body(text, cm, tm, fontDict, fontSize):
            # tm[5] is the y translation of the text matrix.
            if ymin < tm[5] < ymax:
                parts.append(text)

        page.extract_text(visitor_text=visitor_body)
        name = "".join(parts).strip('\n')
        if not name:
            name = str(n)
        table_list[name] = table.df
        # NOTE(review): indentation was lost in the reviewed copy; the
        # counter is assumed to advance only for tables that are kept.
        n += 1
    return table_list


def _save_images(filename, output_dir):
    """Write ``brand.png`` and ``part.png`` from the first page's images.

    ``img0.png`` is saved as the brand image unless it is a palette ("P")
    mode image, in which case ``img1.png`` is used instead.  Any other image
    measuring exactly 430x430 pixels is saved as the part image.
    """
    page = PdfReader(filename).pages[0]
    skip = False
    for image_file_object in page.images:
        with Image.open(io.BytesIO(image_file_object.data)) as img:
            mode, size = img.mode, img.size
        if image_file_object.name == "img0.png" and not skip:
            if mode == "P":
                skip = True
                continue
            with open(output_dir + "/brand.png", "wb") as fp:
                fp.write(image_file_object.data)
        if size == (430, 430):
            with open(output_dir + "/part.png", "wb") as fp:
                fp.write(image_file_object.data)
    if skip:
        for image_file_object in page.images:
            if image_file_object.name == "img1.png":
                with open(output_dir + "/brand.png", "wb") as fp:
                    fp.write(image_file_object.data)


def _is_vertical(table):
    """Return True when *table* holds key/value pairs row by row."""
    rows, cols = table.shape
    if cols == 1:
        return False
    if rows == 1:
        return True
    if cols == 2:
        if rows > 2:
            return True
        # Two rows, two columns is ambiguous: treat it as vertical when the
        # first ':' in the top-left cell is its final character.  An empty
        # cell also satisfies this test (-1 == -1), as in the original.
        first_cell = table.iloc[0, 0]
        return first_cell.find(":") == len(first_cell) - 1
    return False


def _replace_newlines_in_dict(d):
    """Replace newlines with spaces in string values of *d*, recursively.

    Strings inside tuple values and nested dicts are cleaned as well.  The
    dict is modified in place and also returned.
    """
    for key, value in d.items():
        if isinstance(value, str):
            d[key] = value.replace('\n', ' ')
        elif isinstance(value, tuple):
            d[key] = tuple(
                item.replace('\n', ' ') if isinstance(item, str) else item
                for item in value
            )
        elif isinstance(value, dict):
            _replace_newlines_in_dict(value)
    return d


def _build_tables(table_list, dstype):
    """Convert the named DataFrames into ``{table name: {key: value}}``.

    Vertical tables map the first column to the second; horizontal tables
    map each column's first cell to a tuple of the remaining cells.  For
    ``dstype == "Belden"`` a digit-named table is treated as the
    continuation of the previous kept table and merged into it.
    """
    tables = {}
    torename = {}
    previous_table = ""
    for table_name, table in table_list.items():
        vertical = _is_vertical(table)

        # Missing name check: a name that contains this table's last
        # first-column cell was really read from this table, so the table
        # carrying that name has no title of its own.  Empty cells are
        # ignored because an empty string is contained in every name.
        last_cell = table.iloc[-1, 0]
        if last_cell:
            for other_name in table_list:
                if last_cell in other_name:
                    torename[other_name] = "Specs " + str(len(tables))
                    break

        if vertical:
            out = {
                row[0].replace("\n", " ").replace(":", ""): row[1]
                for row in table.itertuples(index=False, name=None)
            }
        else:  # horizontal
            out = {}
            for col in table.columns:
                col_data = tuple(table[col])
                out[col_data[0].replace("\n", " ")] = col_data[1:]
        tables[table_name] = out

        # Multi-page table check.
        if dstype == "Belden" and table_name.isdigit() and len(tables) > 1:
            main = tables[previous_table]
            if vertical:
                main.update(out)
            else:
                # Append the continuation's columns position by position.
                main_keys = list(main)
                for main_key, (cont_key, cont_values) in zip(
                        main_keys, out.items()):
                    main[main_key] = (
                        tuple(main[main_key]) + (cont_key,) + cont_values
                    )
            del tables[table_name]
            # previous_table keeps pointing at the table that absorbed the
            # continuation, so a further continuation page merges into it
            # instead of looking up the entry that was just deleted.
            continue

        previous_table = table_name

    # Apply the renames; a table merged away above has nothing to rename.
    for old_name, new_name in torename.items():
        if old_name in tables:
            tables[new_name] = tables.pop(old_name)

    # Remove multi-line values that occasionally squeak through.
    return _replace_newlines_in_dict(tables)


def parse(filename, output_dir, partnum, dstype="Belden"):
    """Extract tables and images from a datasheet PDF and store the result.

    Args:
        filename: Path of the PDF datasheet to read.
        output_dir: Directory (must already exist) that receives
            ``brand.png``/``part.png`` when matching images are found,
            ``<id>.json`` with the search specs, and the ``parsed`` marker
            file.  Existing ``*.json`` files in it are deleted first.
        partnum: Part number stored in the output.
        dstype: Datasheet family; multi-page table merging only runs for
            ``"Belden"``.

    Returns:
        A dict with the keys ``partnum``, ``id`` (a fresh UUID4 string),
        ``fullspecs`` (nested tables) and ``searchspecs`` (flattened tables
        plus ``partnum`` and ``id``).
    """
    table_list = _read_named_tables(filename)
    _save_images(filename, output_dir)
    tables = _build_tables(table_list, dstype)

    part_id = str(uuid.uuid4())
    searchspecs = {"partnum": partnum, **flatten(tables)}
    searchspecs["id"] = part_id
    output_table = {
        "partnum": partnum,
        "id": part_id,
        "fullspecs": tables,
        "searchspecs": searchspecs,
    }

    # Remove JSON files left over from earlier runs without going through a
    # shell, so unusual characters in output_dir cannot break the command.
    pattern = os.path.join(glob.escape(output_dir), "*.json")
    for stale in glob.glob(pattern):
        try:
            os.remove(stale)
        except FileNotFoundError:
            pass  # already gone; nothing left to clean up

    with open(output_dir + "/" + part_id + ".json", 'w') as json_file:
        json.dump(searchspecs, json_file)

    touch(output_dir + "/parsed")
    return output_table


def flatten(tables):
    """Flatten ``{table: {key: value}}`` into ``{"table: key": value}``.

    Keys are truncated to 64 characters and '.' is removed from the combined
    key.  String values containing a comma become a list of stripped parts;
    other scalar values are converted to int or float when possible.
    One-element tuples are unwrapped and converted the same way; longer
    tuples are left out of the result.
    """
    def convert_to_number(s):
        # Try int first, then float; otherwise hand the value back as is.
        for cast in (int, float):
            try:
                return cast(s)
            except (TypeError, ValueError):
                continue
        return s

    out = {}
    for table_name, table in tables.items():
        for key, value in table.items():
            fullkeyname = (table_name + ": " + key[:64]).replace(".", "")
            if isinstance(value, tuple):
                if len(value) == 1:
                    out[fullkeyname] = convert_to_number(value[0])
                continue
            if isinstance(value, str) and "," in value:
                out[fullkeyname] = [part.strip() for part in value.split(",")]
            else:
                out[fullkeyname] = convert_to_number(value)
    return out


if __name__ == "__main__":
    parse("test2.pdf", "cables/10GXS13", "10GXS13", "Belden")