#!/usr/bin/env python3

# Parse Belden catalog techdata datasheets

import camelot
import numpy as np


def parse(filename):
    """Extract the tables of a PDF datasheet and dump the non-empty ones.

    Every page of ``filename`` is read with camelot's ``lattice`` flavor.
    The number of extracted tables is printed, then each table is cleaned
    in place: empty-string cells are treated as missing, rows and columns
    that are entirely missing are dropped, and the remaining missing cells
    are turned back into empty strings. Each table that is still non-empty
    after cleaning is written to ``table<N>.html`` in the current working
    directory and printed to stdout. Finally, if at least one table was
    extracted, a ``contour`` plot of the first table is saved as
    ``test.png``.

    Args:
        filename: Path of the PDF file handed to ``camelot.read_pdf``.

    Returns:
        None. Results are emitted as files and console output only.
    """
    tables = camelot.read_pdf(
        filename,
        pages="1-end",
        flavor='lattice',
        backend="poppler",
        split_text=False,
        line_scale=100,
        process_background=True,
        resolution=600,
        # Spelled ``iterations`` as in camelot's lattice options; a
        # misspelled keyword here would not be applied.
        iterations=1,
        layout_kwargs={'detect_vertical': False, 'char_margin': 0.5},
        shift_text=['r', 't'],
    )

    print("Total tables extracted:", tables.n)

    # NOTE(review): the counter advances only for tables that are actually
    # written, so the HTML files are numbered consecutively -- confirm this
    # matches the intended numbering.
    n = 0
    for table in tables:
        # Map '' -> NaN so dropna() can recognise blank cells, drop rows and
        # then columns that are blank throughout, then restore '' for output.
        table.df.replace('', np.nan, inplace=True)
        table.df.dropna(inplace=True, how="all")
        table.df.dropna(inplace=True, axis="columns", how="all")
        table.df.replace(np.nan, '', inplace=True)

        if table.df.empty:
            continue

        table.to_html("table" + str(n) + ".html")
        print("\nTable " + str(n))
        print(table.df)
        # Debug aid: per-table grid plot.
        #camelot.plot(table, kind='grid').savefig("test" + str(n) + ".png")
        n += 1

    # Indexing tables[0] on an empty result would raise, so only plot when
    # something was extracted.
    if tables.n > 0:
        camelot.plot(tables[0], kind='contour').savefig("test.png")

    # Debug aid: bulk CSV export of every table.
    #tables.export('foo.csv', f='csv')


if __name__ == "__main__":
    parse("test.pdf")