# jukebox-software/read_datasheet.py

#!/usr/bin/env python3

# Parse Belden catalog techdata datasheets

import io
import os

import camelot
import numpy as np
from PIL import Image
from PyPDF2 import PdfReader

def _table_title(page, table_top, band_height=10):
    """Return the text found in a thin band just above a table.

    ``table_top`` is the y coordinate of the table's top-left corner as
    reported by camelot. Every text fragment on ``page`` whose text-matrix
    y translation lies strictly between ``table_top`` and
    ``table_top + band_height`` is concatenated and returned with leading
    and trailing newlines stripped.
    """
    fragments = []

    def collect(text, cm, tm, font_dict, font_size):
        # tm[5] is the last entry of the text matrix handed over by PyPDF2,
        # i.e. the vertical position of the text fragment on the page.
        if table_top < tm[5] < table_top + band_height:
            fragments.append(text)

    page.extract_text(visitor_text=collect)
    return "".join(fragments).strip('\n')


def _save_page_images(page, output_dir):
    """Write the brand logo and the part picture found on ``page``.

    * ``img0.png`` is saved as ``brand.png`` unless it is a palette ("P"
      mode) image; in that case ``img1.png`` is saved as ``brand.png``
      instead.
    * Any image that is exactly 430x430 pixels is saved as ``part.png``.
    """
    # Materialise once so the fallback pass below does not decode the
    # page's images a second time.
    images = list(page.images)
    use_fallback_brand = False

    for image in images:
        # Open each image a single time and release it right away; only the
        # header information (mode and size) is needed.
        with Image.open(io.BytesIO(image.data)) as picture:
            mode, size = picture.mode, picture.size

        if image.name == "img0.png" and not use_fallback_brand:
            if mode == "P":
                use_fallback_brand = True
                continue
            with open(output_dir + "/brand.png", "wb") as fp:
                fp.write(image.data)

        if size == (430, 430):
            with open(output_dir + "/part.png", "wb") as fp:
                fp.write(image.data)

    if use_fallback_brand:
        for image in images:
            if image.name == "img1.png":
                with open(output_dir + "/brand.png", "wb") as fp:
                    fp.write(image.data)


def parse(filename: str, output_dir: str) -> dict:
    """Extract tables and images from a techdata datasheet PDF.

    Args:
        filename: Path of the PDF datasheet to read.
        output_dir: Directory that receives ``techdata*.csv``, ``brand.png``
            and ``part.png``. It is created if it does not exist yet.

    Returns:
        A dict mapping the text found just above each non-empty table to
        that table's cleaned ``DataFrame``. Tables that share the same
        title text overwrite each other; the last one wins.

    Side effects:
        Besides the files in ``output_dir``, one ``table<N>.html`` file per
        non-empty table is written to the current working directory.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Extract table data
    tables = camelot.read_pdf(
        filename,
        pages="1-end",
        flavor='lattice',
        backend="poppler",
        split_text=False,
        line_scale=100,
        process_background=True,
        resolution=600,
        # Was misspelled "interations", which the lattice parser silently
        # ignored, so the requested dilation pass was never applied.
        iterations=1,
        layout_kwargs={'detect_vertical': False, 'char_margin': 0.5},
        shift_text=['r', 't'],
    )

    reader = PdfReader(filename)
    first_page = reader.pages[0]

    # Cache the current page so consecutive tables on the same page do not
    # look it up again.
    page_index = 0
    page = first_page

    table_list = {}
    html_index = 0
    for table in tables:
        df = table.df
        # Drop rows and columns that contain nothing but empty strings.
        # The cleanup is done in place so the CSV export below sees it too.
        df.replace('', np.nan, inplace=True)
        df.dropna(inplace=True, how="all")
        df.dropna(inplace=True, axis="columns", how="all")
        df.replace(np.nan, '', inplace=True)

        if df.empty:
            continue

        if page_index != table.page - 1:
            page_index = table.page - 1
            page = reader.pages[page_index]

        # Top-left cell's top-left corner gives the table's upper edge.
        table_top = table.cells[0][0].lt[1]
        table_list[_table_title(page, table_top)] = df

        # NOTE(review): this lands in the current working directory rather
        # than output_dir; kept as-is because other tooling may rely on it.
        table.to_html("table" + str(html_index) + ".html")
        html_index += 1

    tables.export(output_dir + '/techdata.csv', f='csv')

    # Extract basic details - brand logo and part image from the first page
    _save_page_images(first_page, output_dir)

    return table_list


if __name__ == "__main__":
    # Manual run against a sample datasheet; results go to the part's folder.
    sample_pdf = "test2.pdf"
    sample_output_dir = "10GXS13"
    parse(sample_pdf, sample_output_dir)