Store files, start overall spec collector code
commit 665503b1d6
parent e511a13262
.gitignore (vendored)
@@ -1 +1,2 @@
 venv
+__pycache__
get_specs.py (new executable file, 85 lines)
@@ -0,0 +1,85 @@
#!/usr/bin/env python3

import os
import sys
import read_datasheet
from alive_progress import alive_bar
import requests


bartext = ""

def try_download_datasheet(partnum):
    global bartext

    sanitized_name = partnum.replace(" ", "")
    url = "https://catalog.belden.com/techdata/EN/" + sanitized_name + "_techdata.pdf"
    print(url)
    try:
        with requests.get(url, stream=True) as r:
            #r.raise_for_status()
            if r.headers.get("Content-Type") != "application/pdf":
                return False
            if r.status_code == 404:
                return False
            os.mkdir(partnum)
            with open(partnum + "/datasheet.pdf", 'wb') as f:
                for chunk in r.iter_content(chunk_size=131072):
                    # If you have chunk encoded response uncomment if
                    # and set chunk_size parameter to None.
                    #if chunk:
                    bartext = bartext + "."
                    bar.text = bartext
                    f.write(chunk)
            #print("")
            return sanitized_name + ".pdf"
    except KeyboardInterrupt:
        print("Quitting!")
        os.remove(partnum + "/datasheet.pdf")
        sys.exit()



if __name__ == "__main__":
    partnums = ["10GXS12", "RST 5L-RKT 5L-949",
                "10GXS13",
                "10GXW12",
                "10GXW13",
                "2412",
                "2413",
                "OSP6AU",
                "FI4D024P9",
                "FISD012R9",
                "FDSD012A9",
                "FSSL024NG",
                "FISX006W0",
                ]
    with alive_bar(len(partnums) * 2, dual_line=True, calibrate=30, bar="classic2", spinner="classic") as bar:
        for partnum in partnums:
            path = partnum + "/datasheet.pdf"
            bartext = "Downloading datasheet for part " + partnum
            bar.text = bartext
            if os.path.exists(path) and os.path.getsize(path) > 1:
                print("Using cached " + path, end='')
                bar.text = "Using cached " + path
                bar(skipped=True)
                print("Parsing Datasheet contents of " + path, end='')
                bar.text = "Parsing Datasheet contents of " + partnum + ".pdf..."
                read_datasheet.parse(path, partnum)
                bar(skipped=False)
            elif try_download_datasheet(partnum) is not False:
                print("Downloaded " + path, end='')
                bar.text = "Downloaded " + path
                bar(skipped=False)
                print("Parsing Datasheet contents of " + path, end='')
                bar.text = "Parsing Datasheet contents of " + partnum + ".pdf..."
                read_datasheet.parse(path, partnum)
                bar(skipped=False)
            else:
                print("Failed to download datasheet for part " + partnum, end='')
                bar.text = "Failed to download datasheet for part " + partnum
                bar(skipped=True)
                bar(skipped=True)
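Note that try_download_datasheet() above assumes the part directory does not exist yet and only signals failure by returning False. Below is a minimal sketch of the same download-and-cache flow written a little more defensively; it is not part of this commit, and the fetch_datasheet name, the timeout value, and the use of os.makedirs(..., exist_ok=True) are assumptions:

# Hypothetical sketch (not in this commit): same Belden tech-data download,
# but tolerant of an existing part directory and explicit about failure.
import os
import requests

def fetch_datasheet(partnum, base_url="https://catalog.belden.com/techdata/EN/"):
    sanitized = partnum.replace(" ", "")
    url = base_url + sanitized + "_techdata.pdf"
    path = os.path.join(partnum, "datasheet.pdf")
    with requests.get(url, stream=True, timeout=30) as r:
        # Mirror the original checks: require a 200 response with a PDF
        # content type, and treat anything else as "no datasheet".
        if r.status_code != 200 or r.headers.get("Content-Type") != "application/pdf":
            return None
        os.makedirs(partnum, exist_ok=True)  # do not fail if the folder already exists
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=131072):
                f.write(chunk)
    return path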
read_datasheet.py
@@ -8,12 +8,12 @@ import numpy as np
 from PIL import Image
 import io
 
-def parse(filename):
+def parse(filename, output_dir):
 
     # Extract table data
 
     tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
-    print("Total tables extracted:", tables.n)
+    #print("Total tables extracted:", tables.n)
     n = 0
     pagenum = 0
     reader = PdfReader(filename)
@@ -53,12 +53,11 @@ def parse(filename):
         #print(table.df)
         #camelot.plot(table, kind='grid').savefig("test" + str(n) + ".png")
         n=n+1
-    camelot.plot(tables[0], kind='grid').savefig("test.png")
+    #camelot.plot(tables[0], kind='grid').savefig("test.png")
 
-    #tables.export('foo.csv', f='csv')
+    tables.export(output_dir + '/techdata.csv', f='csv')
 
-    print(table_list)
-    print(table_list["Variants"])
+    # print(table_list)
     # Extract Basic details - part name & description, image, etc
 
     reader = PdfReader(filename)
@@ -71,19 +70,20 @@ def parse(filename):
             if Image.open(io.BytesIO(image_file_object.data)).mode == "P":
                 skip = True
                 continue
-            with open("brand.png", "wb") as fp:
+            with open(output_dir + "/brand.png", "wb") as fp:
                 fp.write(image_file_object.data)
         if Image.open(io.BytesIO(image_file_object.data)).size == (430, 430):
-            with open("part.png", "wb") as fp:
+            with open(output_dir + "/part.png", "wb") as fp:
                 fp.write(image_file_object.data)
     if skip:
         for image_file_object in page.images:
             if image_file_object.name == "img1.png":
-                with open("brand.png", "wb") as fp:
+                with open(output_dir + "/brand.png", "wb") as fp:
                     fp.write(image_file_object.data)
     count += 1
     return table_list
 
 
 if __name__ == "__main__":
-    parse("test2.pdf")
+    parse("test2.pdf", "10GXS13")
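With the output_dir parameter threaded through, the new tables.export(output_dir + '/techdata.csv', f='csv') call drops one CSV per extracted table into the part's directory. Below is a small sketch of gathering those files back up; it is not part of this commit, and the load_exported_tables name and the techdata*.csv glob pattern are assumptions about camelot's export naming:

# Hypothetical sketch (not in this commit): collect whatever CSVs
# tables.export() wrote for a given part directory.
import csv
import glob
import os

def load_exported_tables(output_dir):
    tables = {}
    for path in sorted(glob.glob(os.path.join(output_dir, "techdata*.csv"))):
        with open(path, newline="") as f:
            tables[os.path.basename(path)] = list(csv.reader(f))
    return tables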
requirements.txt
@@ -1,5 +1,9 @@
+# Runtime
 camelot-py[base]
 opencv-python
 pypdf2==2.12.1
-matplotlib
 #PyQt5
+alive-progress
+requests
+
+# Development
+matplotlib