Extract table names & images

This commit is contained in:
Cole Deck 2024-01-02 18:13:14 -06:00
parent ec1d1be2a5
commit e511a13262
3 changed files with 66 additions and 5 deletions

View File

@@ -2,27 +2,88 @@
# Parse Belden catalog techdata datasheets
from PyPDF2 import PdfReader
import camelot
import numpy as np
from PIL import Image
import io
# Height of the band above a table's top edge in which text is treated as the
# table's title. Same units as the camelot cell coordinates it is added to.
_TITLE_BAND_HEIGHT = 10


def _table_title(page, table):
    """Return the text located just above ``table`` on ``page``.

    Args:
        page: PyPDF2 page object the table was extracted from.
        table: camelot table whose title should be looked up.

    Returns:
        The concatenated text fragments whose vertical position lies strictly
        between the table's top edge and ``_TITLE_BAND_HEIGHT`` above it, with
        leading/trailing newlines stripped. Empty string if nothing is found.
    """
    # Top-left cell's top-left corner: its y value marks the table's top edge.
    band_bottom = table.cells[0][0].lt[1]
    band_top = band_bottom + _TITLE_BAND_HEIGHT
    fragments = []

    def collect(text, cm, tm, font_dict, font_size):
        # tm[5] is taken as the fragment's y position.
        # NOTE(review): assumes PyPDF2 and camelot share the same y axis/origin
        # -- confirm against a datasheet with known table titles.
        if band_bottom < tm[5] < band_top:
            fragments.append(text)

    page.extract_text(visitor_text=collect)
    return "".join(fragments).strip("\n")


def _save_page_images(page):
    """Write the brand logo and part picture embedded in ``page`` to disk.

    * ``brand.png`` is taken from the image named ``img0.png``; if that image
      is palette-mode ("P") it is skipped and ``img1.png`` is used instead.
    * ``part.png`` is any image whose pixel size is exactly 430x430.

    Files are written to the current working directory and overwritten if
    they already exist.
    """
    use_fallback_logo = False
    for image in page.images:
        # Decode the header once per image; the original re-opened the same
        # bytes for every check and never closed the PIL image.
        with Image.open(io.BytesIO(image.data)) as decoded:
            mode, size = decoded.mode, decoded.size

        if image.name == "img0.png" and not use_fallback_logo:
            if mode == "P":
                # Palette image is not the logo we want; fall back to img1.png
                # and do not consider this image as the part picture either.
                use_fallback_logo = True
                continue
            with open("brand.png", "wb") as fp:
                fp.write(image.data)

        if size == (430, 430):
            with open("part.png", "wb") as fp:
                fp.write(image.data)

    # The fallback does not depend on the loop position, so run it once after
    # the scan instead of re-walking page.images on every remaining image.
    if use_fallback_logo:
        for image in page.images:
            if image.name == "img1.png":
                with open("brand.png", "wb") as fp:
                    fp.write(image.data)


def parse(filename):
    """Extract tables and images from a Belden techdata datasheet PDF.

    Side effects (all paths relative to the current working directory):
      * prints the number of tables, each non-empty table, the title->table
        mapping and the "Variants" table (``None`` when absent);
      * writes ``table<N>.html`` for every non-empty table;
      * writes ``test.png`` (grid plot of the first table) when at least one
        table was extracted;
      * writes ``brand.png`` / ``part.png`` from the first page's images.

    Args:
        filename: Path of the PDF datasheet to parse.

    Returns:
        None.
    """
    # Extract table data
    tables = camelot.read_pdf(
        filename,
        pages="1-end",
        flavor="lattice",
        backend="poppler",
        split_text=False,
        line_scale=100,
        process_background=True,
        resolution=600,
        # Was spelled "interations", which is not a camelot option and was
        # presumably ignored silently.
        # NOTE(review): with the option now active, re-check extraction
        # results on the sample datasheets.
        iterations=1,
        layout_kwargs={"detect_vertical": False, "char_margin": 0.5},
        shift_text=["r", "t"],
    )
    print("Total tables extracted:", tables.n)

    reader = PdfReader(filename)
    table_list = {}
    n = 0  # index of the next non-empty table, used in output file names
    for table in tables:
        # Drop rows and columns that contain only empty strings.
        table.df.replace('', np.nan, inplace=True)
        table.df.dropna(inplace=True, how="all")
        table.df.dropna(inplace=True, axis="columns", how="all")
        table.df.replace(np.nan, '', inplace=True)
        if table.df.empty:
            continue

        # Extract table names (camelot page numbers are 1-based).
        title = _table_title(reader.pages[table.page - 1], table)
        # Tables sharing a title (including an empty one) overwrite each other.
        table_list[title] = table.df

        table.to_html("table" + str(n) + ".html")
        print("\nTable " + str(n))
        print(table.df)
        n += 1

    # Guard against PDFs with no detected tables (tables[0] would raise).
    # Only the grid plot is produced: the contour plot was written to the same
    # file and immediately overwritten.
    if tables.n:
        camelot.plot(tables[0], kind='grid').savefig("test.png")

    print(table_list)
    # .get(): not every datasheet has a table titled "Variants".
    print(table_list.get("Variants"))

    # Extract Basic details - part name & description, image, etc
    # Reuse the reader opened above instead of parsing the PDF a second time.
    _save_page_images(reader.pages[0])
if __name__ == "__main__":
    # Run the parser over each sample datasheet in order. Output files use
    # fixed names, so later runs overwrite the files of earlier ones.
    for sample_pdf in ("test.pdf", "test2.pdf"):
        parse(sample_pdf)

BIN
test.pdf

Binary file not shown.

BIN
test2.pdf Normal file

Binary file not shown.