#!/usr/bin/env python3
"""Parse Belden catalog techdata datasheets."""

import io
import os

import camelot
import numpy as np
from PIL import Image
from PyPDF2 import PdfReader

# Height of the vertical band, measured from a table's top edge, in which
# text is treated as that table's title.
# NOTE(review): presumably PDF user-space units with y growing upwards, so
# the band sits just above the table -- confirm against a sample datasheet.
_TITLE_BAND_HEIGHT = 10

# An image on the first page with exactly this (width, height) is saved as
# part.png.
_PART_IMAGE_SIZE = (430, 430)


def _clean_table(df):
    """Drop rows and columns of *df* that contain only empty strings.

    The frame is modified in place so that a later ``tables.export`` call
    writes the cleaned data.
    """
    df.replace('', np.nan, inplace=True)
    df.dropna(inplace=True, how="all")
    df.dropna(inplace=True, axis="columns", how="all")
    df.replace(np.nan, '', inplace=True)


def _text_in_band(page, ymin, ymax):
    """Return the text of *page* whose y position lies strictly inside (ymin, ymax)."""
    parts = []

    def visitor_body(text, cm, tm, fontDict, fontSize):
        # tm[5] is the vertical translation of the text matrix.
        if ymin < tm[5] < ymax:
            parts.append(text)

    page.extract_text(visitor_text=visitor_body)
    return "".join(parts).strip('\n')


def _write_bytes(path, data):
    """Write *data* to *path*, replacing any existing file."""
    with open(path, "wb") as fp:
        fp.write(data)


def _save_images(page, output_dir):
    """Save brand.png and part.png from the images embedded in *page*.

    "img0.png" is written as brand.png unless it is a palette ("P" mode)
    image; in that case "img1.png" is written as brand.png instead. Any
    other processed image whose size equals ``_PART_IMAGE_SIZE`` is written
    as part.png.
    """
    brand_path = output_dir + "/brand.png"
    part_path = output_dir + "/part.png"

    palette_logo = False
    fallback_brand = None

    for image in page.images:
        data = image.data
        if image.name == "img1.png":
            fallback_brand = data

        # Decode the header once; both the mode and the size come from it.
        with Image.open(io.BytesIO(data)) as picture:
            mode = picture.mode
            size = picture.size

        if image.name == "img0.png" and not palette_logo:
            if mode == "P":
                # A palette img0 is not used at all (neither as brand nor
                # as part image); img1 takes its place as the brand image.
                palette_logo = True
                continue
            _write_bytes(brand_path, data)

        if size == _PART_IMAGE_SIZE:
            _write_bytes(part_path, data)

    if palette_logo and fallback_brand is not None:
        _write_bytes(brand_path, fallback_brand)


def parse(filename, output_dir):
    """Extract the tables and product images of a datasheet PDF.

    Tables are detected with camelot (lattice flavor), stripped of empty
    rows and columns, and exported as JSON using the base path
    ``<output_dir>/techdata.json``. The brand and part images found on the
    first page are written to ``<output_dir>/brand.png`` and
    ``<output_dir>/part.png``.

    Args:
        filename: Path of the PDF datasheet to read.
        output_dir: Directory that receives the exported files. It is
            created if it does not exist yet.

    Returns:
        A dict mapping each non-empty table's title text to its cleaned
        ``DataFrame``. Tables that share the same title text overwrite each
        other; the last one wins.
    """
    # Create the target directory up front so a missing directory does not
    # surface only after the expensive table extraction has finished.
    os.makedirs(output_dir, exist_ok=True)

    # Extract table data.
    # NOTE: the keyword is ``iterations`` (camelot's lattice option for the
    # number of erosion/dilation passes); it was previously misspelled as
    # ``interations``, so the requested value never took effect.
    tables = camelot.read_pdf(
        filename,
        pages="1-end",
        flavor='lattice',
        backend="poppler",
        split_text=False,
        line_scale=100,
        process_background=True,
        resolution=600,
        iterations=1,
        layout_kwargs={'detect_vertical': False, 'char_margin': 0.5},
        shift_text=['r', 't'],
    )

    reader = PdfReader(filename)
    first_page = reader.pages[0]

    # Consecutive tables usually share a page, so the page object is only
    # looked up again when the page number changes.
    page_index = 0
    page = first_page
    table_list = {}

    for table in tables:
        _clean_table(table.df)
        if table.df.empty:
            continue

        # Extract the table name: y coordinate of the top-left cell's
        # top-left corner marks the lower bound of the title band.
        table_top = table.cells[0][0].lt[1]

        if page_index != table.page - 1:
            page_index = table.page - 1
            page = reader.pages[page_index]

        title = _text_in_band(page, table_top, table_top + _TITLE_BAND_HEIGHT)
        table_list[title] = table.df

    tables.export(output_dir + '/techdata.json', f='json')

    # Extract basic details - brand and part images from the first page.
    _save_images(first_page, output_dir)

    return table_list


if __name__ == "__main__":
    parse("test2.pdf", "10GXS13")