diff --git a/.gitignore b/.gitignore
index f5e96db..d75edea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-venv
\ No newline at end of file
+venv
+__pycache__
\ No newline at end of file
diff --git a/get_specs.py b/get_specs.py
new file mode 100755
index 0000000..f5362a8
--- /dev/null
+++ b/get_specs.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import read_datasheet
+from alive_progress import alive_bar
+import requests
+
+
+bartext = ""
+
+def try_download_datasheet(partnum):
+    global bartext
+
+    sanitized_name = partnum.replace(" ", "")
+    url = "https://catalog.belden.com/techdata/EN/" + sanitized_name + "_techdata.pdf"
+    print(url)
+    try:
+        with requests.get(url, stream=True) as r:
+            #r.raise_for_status()
+            if r.headers.get("Content-Type") != "application/pdf":
+                return False
+            if r.status_code == 404:
+                return False
+            os.mkdir(partnum)
+            with open(partnum + "/datasheet.pdf", 'wb') as f:
+                for chunk in r.iter_content(chunk_size=131072):
+                    # If you have chunk encoded response uncomment if
+                    # and set chunk_size parameter to None.
+                    #if chunk:
+                    bartext = bartext + "."
+                    bar.text = bartext
+                    f.write(chunk)
+            #print("")
+            return sanitized_name + ".pdf"
+    except KeyboardInterrupt:
+        print("Quitting!")
+        os.remove(partnum + "/datasheet.pdf")
+        sys.exit()
+
+
+
+
+
+if __name__ == "__main__":
+    partnums = ["10GXS12", "RST 5L-RKT 5L-949",
+"10GXS13",
+"10GXW12",
+"10GXW13",
+"2412",
+"2413",
+"OSP6AU",
+"FI4D024P9",
+"FISD012R9",
+"FDSD012A9",
+"FSSL024NG",
+"FISX006W0",
+    ]
+    with alive_bar(len(partnums) * 2, dual_line=True, calibrate=30, bar="classic2", spinner="classic") as bar:
+        for partnum in partnums:
+            path = partnum + "/datasheet.pdf"
+            bartext = "Downloading datasheet for part " + partnum
+            bar.text = bartext
+            if os.path.exists(path) and os.path.getsize(path) > 1:
+                print("Using cached " + path, end='')
+                bar.text = "Using cached " + path
+                bar(skipped=True)
+                print("Parsing Datasheet contents of " + path, end='')
+                bar.text = "Parsing Datasheet contents of " + partnum + ".pdf..."
+                read_datasheet.parse(path, partnum)
+                bar(skipped=False)
+            elif try_download_datasheet(partnum) is not False:
+                print("Downloaded " + path, end='')
+                bar.text = "Downloaded " + path
+                bar(skipped=False)
+                print("Parsing Datasheet contents of " + path, end='')
+                bar.text = "Parsing Datasheet contents of " + partnum + ".pdf..."
+                read_datasheet.parse(path, partnum)
+                bar(skipped=False)
+            else:
+                print("Failed to download datasheet for part " + partnum, end='')
+                bar.text = "Failed to download datasheet for part " + partnum
+                bar(skipped=True)
+                bar(skipped=True)
+  
\ No newline at end of file
diff --git a/read-datasheet.py b/read_datasheet.py
similarity index 85%
rename from read-datasheet.py
rename to read_datasheet.py
index dfa329f..3e6063a 100755
--- a/read-datasheet.py
+++ b/read_datasheet.py
@@ -8,12 +8,12 @@ import numpy as np
 from PIL import Image
 import io
 
-def parse(filename):
+def parse(filename, output_dir):
 
     # Extract table data
     tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
-    print("Total tables extracted:", tables.n)
+    #print("Total tables extracted:", tables.n)
     n = 0
     pagenum = 0
     reader = PdfReader(filename)
 
@@ -53,12 +53,11 @@ def parse(filename):
         #print(table.df)
         #camelot.plot(table, kind='grid').savefig("test" + str(n) + ".png")
         n=n+1
-    camelot.plot(tables[0], kind='grid').savefig("test.png")
+    #camelot.plot(tables[0], kind='grid').savefig("test.png")
 
-    #tables.export('foo.csv', f='csv')
+    tables.export(output_dir + '/techdata.csv', f='csv')
 
-    print(table_list)
-    print(table_list["Variants"])
+    # print(table_list)
 
     # Extract Basic details - part name & description, image, etc
     reader = PdfReader(filename)
@@ -71,19 +70,20 @@ def parse(filename):
                 if Image.open(io.BytesIO(image_file_object.data)).mode == "P":
                     skip = True
                     continue
-                with open("brand.png", "wb") as fp:
+                with open(output_dir + "/brand.png", "wb") as fp:
                     fp.write(image_file_object.data)
             if Image.open(io.BytesIO(image_file_object.data)).size == (430, 430):
-                with open("part.png", "wb") as fp:
+                with open(output_dir + "/part.png", "wb") as fp:
                     fp.write(image_file_object.data)
         if skip:
             for image_file_object in page.images:
                 if image_file_object.name == "img1.png":
-                    with open("brand.png", "wb") as fp:
+                    with open(output_dir + "/brand.png", "wb") as fp:
                         fp.write(image_file_object.data)
 
         count += 1
+    return table_list
 
 
 if __name__ == "__main__":
-    parse("test2.pdf")
\ No newline at end of file
+    parse("test2.pdf", "10GXS13")
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 511e0ce..59c1021 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,9 @@
+# Runtime
 camelot-py[base]
 opencv-python
 pypdf2==2.12.1
-matplotlib
-#PyQt5
\ No newline at end of file
+alive-progress
+requests
+
+# Development
+matplotlib
\ No newline at end of file
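
Usage note (not part of the patch): the sketch below shows how the new parse(filename, output_dir) interface can be driven for a single part number, mirroring the download-and-parse flow that get_specs.py wraps in an alive_progress bar. It assumes it is run from the repository root with requirements.txt installed (the camelot call in read_datasheet.py uses the poppler backend, so poppler must be available). The part number is one of the examples hard-coded in get_specs.py; os.makedirs(..., exist_ok=True) is used here instead of get_specs.py's os.mkdir so the sketch tolerates an existing output directory.

    #!/usr/bin/env python3
    # Minimal single-part driver mirroring get_specs.py's flow.
    import os
    import requests
    import read_datasheet

    partnum = "10GXS12"  # example part number taken from get_specs.py
    url = "https://catalog.belden.com/techdata/EN/" + partnum.replace(" ", "") + "_techdata.pdf"

    with requests.get(url, stream=True) as r:
        # get_specs.py treats a non-PDF Content-Type or a 404 as a failed download.
        if r.status_code == 200 and r.headers.get("Content-Type") == "application/pdf":
            os.makedirs(partnum, exist_ok=True)
            with open(partnum + "/datasheet.pdf", "wb") as f:
                for chunk in r.iter_content(chunk_size=131072):
                    f.write(chunk)
            # parse() exports the extracted tables as CSV into the output
            # directory, writes brand.png / part.png there, and now returns
            # the table data instead of only printing it.
            tables = read_datasheet.parse(partnum + "/datasheet.pdf", partnum)
            print(tables)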