# Parse Belden catalog techdata datasheets

import io

from PyPDF2 import PdfReader
import camelot
import numpy as np
from PIL import Image

# Height of the band, measured upward from a table's top edge, that is
# searched for the table's title text. Same units as camelot's cell
# coordinates (presumably PDF points -- TODO confirm).
_TITLE_BAND_HEIGHT = 10

# Pixel size used to recognise the part image among the first page's images.
_PART_IMAGE_SIZE = (430, 430)


def _drop_blank_cells(df):
    """Strip rows and columns that contain only empty strings, in place."""
    # Empty strings become NaN so dropna() can see them, then the remaining
    # NaNs are turned back into empty strings.
    df.replace('', np.nan, inplace=True)
    df.dropna(inplace=True, how="all")
    df.dropna(inplace=True, axis="columns", how="all")
    df.replace(np.nan, '', inplace=True)


def _table_title(page, table):
    """Return the text found just above *table* on *page*.

    The y coordinate of the top-left corner of the table's top-left cell is
    taken as the table's top edge; every text fragment whose text-matrix y
    offset lies strictly inside the band ``(top, top + _TITLE_BAND_HEIGHT)``
    is concatenated, and leading/trailing newlines are stripped.
    """
    top = table.cells[0][0].lt[1]
    ymin = top
    ymax = top + _TITLE_BAND_HEIGHT
    parts = []

    def visitor_body(text, cm, tm, fontDict, fontSize):
        # tm[5] is the vertical translation component of the text matrix.
        if ymin < tm[5] < ymax:
            parts.append(text)

    page.extract_text(visitor_text=visitor_body)
    return "".join(parts).strip('\n')


def _write_bytes(path, data):
    """Write *data* to *path* in binary mode, replacing any existing file."""
    with open(path, "wb") as fp:
        fp.write(data)


def _save_first_page_images(page):
    """Save the brand logo and part image found on *page*.

    * ``brand.png`` is ``img0.png``, unless that image is palette-mode
      ("P"), in which case ``img1.png`` is used instead.
    * ``part.png`` is any image whose size equals ``_PART_IMAGE_SIZE``
      (the last match wins).
    """
    images = list(page.images)
    use_fallback_brand = False
    for image in images:
        # Decode each image once and close it again; only the header
        # information (mode and size) is needed here.
        with Image.open(io.BytesIO(image.data)) as decoded:
            mode = decoded.mode
            size = decoded.size
        if image.name == "img0.png" and not use_fallback_brand:
            if mode == "P":
                use_fallback_brand = True
                continue
            _write_bytes("brand.png", image.data)
        if size == _PART_IMAGE_SIZE:
            _write_bytes("part.png", image.data)

    if use_fallback_brand:
        for image in images:
            if image.name == "img1.png":
                _write_bytes("brand.png", image.data)


def parse(filename):
    """Extract tables and images from the datasheet PDF *filename*.

    Side effects, all relative to the current working directory:

    * ``table<N>.html`` is written for every table camelot extracts, where
      N is the table's index in extraction order;
    * ``test.png`` receives a grid plot of the first table, if there is one;
    * ``brand.png`` / ``part.png`` are written from the first page's images;
    * the number of tables, the title -> DataFrame mapping and (when
      present) the "Variants" table are printed.
    """
    # Extract table data
    tables = camelot.read_pdf(
        filename,
        pages="1-end",
        flavor='lattice',
        backend="poppler",
        split_text=False,
        line_scale=100,
        process_background=True,
        resolution=600,
        # Previously misspelled "interations", which is not the keyword
        # camelot's lattice parser expects.
        iterations=1,
        layout_kwargs={'detect_vertical': False, 'char_margin': 0.5},
        shift_text=['r', 't'],
    )
    print("Total tables extracted:", tables.n)

    reader = PdfReader(filename)
    # NOTE(review): tables that share a title (or have none) overwrite each
    # other in this mapping -- confirm whether that is acceptable.
    table_list = {}
    for n, table in enumerate(tables):
        _drop_blank_cells(table.df)
        if not table.df.empty:
            # camelot page numbers start at 1; reader.pages is 0-indexed.
            page = reader.pages[table.page - 1]
            table_list[_table_title(page, table)] = table.df
        table.to_html("table" + str(n) + ".html")

    # Guard against datasheets where no table was detected at all.
    if tables.n:
        camelot.plot(tables[0], kind='grid').savefig("test.png")
    print(table_list)
    # Not every datasheet has a table titled "Variants".
    if "Variants" in table_list:
        print(table_list["Variants"])

    # Extract Basic details - part name & description, image, etc
    _save_first_page_images(reader.pages[0])


if __name__ == "__main__":
    parse("test2.pdf")