Extract table names & images

2024-01-02 18:13:14 -06:00
parent ec1d1be2a5
commit e511a13262
3 changed files with 66 additions and 5 deletions
--- a/read-datasheet.py
+++ b/read-datasheet.py
@@ -2,27 +2,88 @@

 # Parse Belden catalog techdata datasheets 

+from PyPDF2 import PdfReader
 import camelot
 import numpy as np
+from PIL import Image
+import io

 def parse(filename):
+
+    # Extract table data
+
    tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
    print("Total tables extracted:", tables.n)
    n = 0
-    
+    pagenum = 0
+    reader = PdfReader(filename)
+    page = reader.pages[0]
+    table_list = {}
    for table in tables:
        table.df.replace('', np.nan, inplace=True)
        table.df.dropna(inplace=True, how="all")
        table.df.dropna(inplace=True, axis="columns", how="all")
        table.df.replace(np.nan, '', inplace=True)
+        
        if not table.df.empty:
+            #print("\nTable " + str(n))
+            # Extract table name: gather page text whose y position lies within 10 units beyond the table's top-left y
+            table_start = table.cells[0][0].lt[1] # Read the y-coordinate of the top-left cell's top-left corner
+            #print(table_start)
+            ymin = table_start
+            ymax = table_start + 10
+            if pagenum != table.page - 1:
+                pagenum = table.page - 1
+                page = reader.pages[table.page - 1]
+            parts = []
+            def visitor_body(text, cm, tm, fontDict, fontSize):
+                y = tm[5]
+                if y > ymin and y < ymax:
+                    parts.append(text)
+
+            page.extract_text(visitor_text=visitor_body)
+            text_body = "".join(parts).strip('\n')
+            #print(text_body)
+
+
+
+            table_list[text_body] = table.df
            table.to_html("table" + str(n) + ".html")
-            print("\nTable " + str(n))
-            print(table.df)
+            
+            #print(table.df)
            #camelot.plot(table, kind='grid').savefig("test" + str(n) + ".png")
            n=n+1
-    camelot.plot(tables[0], kind='contour').savefig("test.png")
+    camelot.plot(tables[0], kind='grid').savefig("test.png")
+    
    #tables.export('foo.csv', f='csv')

+    print(table_list)
+    print(table_list["Variants"])
+    # Extract basic details - part name & description, image, etc. (only the images are written out below so far)
+
+    reader = PdfReader(filename)
+    page = reader.pages[0]
+    count = 0
+    skip = False
+    for image_file_object in page.images:
+        if image_file_object.name == "img0.png" and skip == False:
+            #print(Image.open(io.BytesIO(image_file_object.data)).mode)
+            if Image.open(io.BytesIO(image_file_object.data)).mode == "P":
+                skip = True
+                continue
+            with open("brand.png", "wb") as fp:
+                fp.write(image_file_object.data)
+        if Image.open(io.BytesIO(image_file_object.data)).size == (430, 430):
+            with open("part.png", "wb") as fp:
+                fp.write(image_file_object.data)
+    if skip:
+        for image_file_object in page.images:
+            if image_file_object.name == "img1.png":
+                with open("brand.png", "wb") as fp:
+                    fp.write(image_file_object.data)
+                    count += 1
+
+    
+
 if __name__ == "__main__":
-    parse("test.pdf")
+    parse("test2.pdf")
--- a/test.pdf
+++ b/test.pdf
--- a/test2.pdf
+++ b/test2.pdf