Store files, start overall spec collector code

2024-01-02 19:04:18 -06:00
parent e511a13262
commit 665503b1d6
4 changed files with 103 additions and 13 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-venv
+venv
 __pycache__
--- a/get_specs.py
+++ b/get_specs.py
@@ -0,0 +1,85 @@
 #!/usr/bin/env python3
 import os
 import sys
 import read_datasheet
 from alive_progress import alive_bar
 import requests
 bartext = ""
 def try_download_datasheet(partnum):
    """Download the Belden tech-data PDF for ``partnum``.

    The PDF is streamed to ``<partnum>/datasheet.pdf`` (the directory is
    created if needed). One "." is appended to the progress-bar text for
    every chunk written.

    Relies on two module-level names: ``bartext`` (current progress text)
    and ``bar`` (the progress-bar handle bound by the script entry point's
    ``with alive_bar(...) as bar``), so it must be called while that bar
    is active.

    Args:
        partnum: Part number as listed in the catalog; spaces are stripped
            to build the URL, but the directory name keeps them.

    Returns:
        ``False`` when the server does not answer with a successful PDF
        response or the transfer fails; otherwise the string
        ``<sanitized part number>.pdf`` (callers only compare against
        ``False``).

    Exits the process (after removing any partial file) on Ctrl-C.
    """
    global bartext
    sanitized_name = partnum.replace(" ", "")
    url = "https://catalog.belden.com/techdata/EN/" + sanitized_name + "_techdata.pdf"
    print(url)
    target = partnum + "/datasheet.pdf"
    try:
        with requests.get(url, stream=True) as r:
            # Reject 404 and every other error status, not just 404.
            if not r.ok:
                return False
            # The header may carry parameters ("application/pdf;charset=UTF-8")
            # or be absent, so compare only the media type itself.
            content_type = r.headers.get("Content-Type") or ""
            if content_type.split(";")[0].strip().lower() != "application/pdf":
                return False
            # The directory may already exist from an earlier, incomplete run.
            os.makedirs(partnum, exist_ok=True)
            with open(target, 'wb') as f:
                for chunk in r.iter_content(chunk_size=131072):
                    bartext = bartext + "."
                    bar.text = bartext
                    f.write(chunk)
        return sanitized_name + ".pdf"
    except requests.RequestException as err:
        # A truncated file would be mistaken for a valid cached datasheet on
        # the next run (the caller only checks its size), so discard it.
        if os.path.exists(target):
            os.remove(target)
        print("Download failed for " + url + ": " + str(err))
        return False
    except KeyboardInterrupt:
        print("Quitting!")
        # The interrupt can arrive before the file was ever created.
        if os.path.exists(target):
            os.remove(target)
        sys.exit()
 if __name__ == "__main__":
    # Script entry point: for each part number, reuse a cached datasheet or
    # download one, then hand it to read_datasheet.parse(). Every part
    # accounts for exactly two progress-bar ticks (fetch + parse).
    partnums = [
        "10GXS12",
        "RST 5L-RKT 5L-949",
        "10GXS13",
        "10GXW12",
        "10GXW13",
        "2412",
        "2413",
        "OSP6AU",
        "FI4D024P9",
        "FISD012R9",
        "FDSD012A9",
        "FSSL024NG",
        "FISX006W0",
    ]
    with alive_bar(len(partnums) * 2, dual_line=True, calibrate=30, bar="classic2", spinner="classic") as bar:
        for partnum in partnums:
            datasheet_path = partnum + "/datasheet.pdf"
            bartext = "Downloading datasheet for part " + partnum
            bar.text = bartext

            # A file larger than one byte counts as an already-downloaded copy.
            from_cache = os.path.exists(datasheet_path) and os.path.getsize(datasheet_path) > 1
            if from_cache:
                status_text = "Using cached " + datasheet_path
            elif try_download_datasheet(partnum) is not False:
                status_text = "Downloaded " + datasheet_path
            else:
                # Nothing to parse: mark both the fetch and parse ticks as skipped.
                failure_text = "Failed to download datasheet for part " + partnum
                print(failure_text, end='')
                bar.text = failure_text
                bar(skipped=True)
                bar(skipped=True)
                continue

            # Fetch tick: skipped when the cached copy was used.
            print(status_text, end='')
            bar.text = status_text
            bar(skipped=from_cache)

            # Parse tick.
            print("Parsing Datasheet contents of " + datasheet_path, end='')
            bar.text = "Parsing Datasheet contents of " + partnum + ".pdf..."
            read_datasheet.parse(datasheet_path, partnum)
            bar(skipped=False)
--- a/read_datasheet.py
+++ b/read_datasheet.py
@@ -8,12 +8,12 @@ import numpy as np
 from PIL import Image
 import io
-def parse(filename):
+def parse(filename, output_dir):
    # Extract table data
    tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
-    print("Total tables extracted:", tables.n)
+    #print("Total tables extracted:", tables.n)
    n = 0
    pagenum = 0
    reader = PdfReader(filename)
@@ -53,12 +53,11 @@ def parse(filename):
            #print(table.df)
            #camelot.plot(table, kind='grid').savefig("test" + str(n) + ".png")
            n=n+1
-    camelot.plot(tables[0], kind='grid').savefig("test.png")
+    #camelot.plot(tables[0], kind='grid').savefig("test.png")
-    #tables.export('foo.csv', f='csv')
+    tables.export(output_dir + '/techdata.csv', f='csv')
-    print(table_list)
+    # print(table_list)
    print(table_list["Variants"])
    # Extract Basic details - part name & description, image, etc
    reader = PdfReader(filename)
@@ -71,19 +70,20 @@ def parse(filename):
            if Image.open(io.BytesIO(image_file_object.data)).mode == "P":
                skip = True
                continue
-            with open("brand.png", "wb") as fp:
+            with open(output_dir + "/brand.png", "wb") as fp:
                fp.write(image_file_object.data)
        if Image.open(io.BytesIO(image_file_object.data)).size == (430, 430):
-            with open("part.png", "wb") as fp:
+            with open(output_dir + "/part.png", "wb") as fp:
                fp.write(image_file_object.data)
    if skip:
        for image_file_object in page.images:
            if image_file_object.name == "img1.png":
-                with open("brand.png", "wb") as fp:
+                with open(output_dir + "/brand.png", "wb") as fp:
                    fp.write(image_file_object.data)
                    count += 1
    return table_list
 if __name__ == "__main__":
-    parse("test2.pdf")
+    parse("test2.pdf", "10GXS13")
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,9 @@
 # Runtime
 camelot-py[base]
 opencv-python
 pypdf2==2.12.1
-matplotlib
+alive-progress
-#PyQt5
+requests
 # Development
 matplotlib