Store files, start overall spec collector code

This commit is contained in:
Cole Deck 2024-01-02 19:04:18 -06:00
parent e511a13262
commit 665503b1d6
4 changed files with 103 additions and 13 deletions

3
.gitignore vendored
View File

@ -1 +1,2 @@
venv
venv
__pycache__

85
get_specs.py Executable file
View File

@ -0,0 +1,85 @@
#!/usr/bin/env python3
import os
import sys
import read_datasheet
from alive_progress import alive_bar
import requests
bartext = ""
def _discard_partial_download(path):
    """Delete *path* if it exists; do nothing when it was never created."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def try_download_datasheet(partnum):
    """Download the Belden tech-data PDF for *partnum*.

    The PDF is streamed to ``<partnum>/datasheet.pdf`` (the directory is
    created when missing).  While streaming, one "." per received chunk is
    appended to the module-level ``bartext`` and mirrored to ``bar.text``;
    ``bar`` is the progress-bar object bound at module level by the
    ``__main__`` block, so this function is only usable from that script.

    Args:
        partnum: Part number as written in the catalog; spaces are removed
            to build the URL, but the directory name keeps them.

    Returns:
        ``"<part number without spaces>.pdf"`` on success, or ``False`` when
        the server answers 404, the response is not a PDF, or the request
        fails (connection error, timeout, interrupted stream).

    Raises:
        SystemExit: when the user interrupts the download with Ctrl-C.
    """
    global bartext

    sanitized_name = partnum.replace(" ", "")
    url = "https://catalog.belden.com/techdata/EN/" + sanitized_name + "_techdata.pdf"
    target = partnum + "/datasheet.pdf"
    print(url)
    try:
        # A timeout keeps an unresponsive server from hanging the whole run.
        with requests.get(url, stream=True, timeout=30) as r:
            if r.status_code == 404:
                return False
            # Compare only the media type so "application/pdf; charset=..."
            # is still accepted.
            content_type = r.headers.get("Content-Type") or ""
            if content_type.split(";")[0].strip().lower() != "application/pdf":
                return False
            # The directory may be left over from an earlier failed attempt.
            os.makedirs(partnum, exist_ok=True)
            with open(target, 'wb') as f:
                for chunk in r.iter_content(chunk_size=131072):
                    bartext = bartext + "."
                    bar.text = bartext
                    f.write(chunk)
            return sanitized_name + ".pdf"
    except requests.exceptions.RequestException:
        # A truncated file would pass the caller's size-based cache check on
        # the next run, so remove it before reporting the failure.
        _discard_partial_download(target)
        return False
    except KeyboardInterrupt:
        print("Quitting!")
        # The interrupt may arrive before the file was ever opened.
        _discard_partial_download(target)
        sys.exit()
if __name__ == "__main__":
    # Part numbers whose datasheets are fetched (or reused from disk) and parsed.
    partnums = [
        "10GXS12",
        "RST 5L-RKT 5L-949",
        "10GXS13",
        "10GXW12",
        "10GXW13",
        "2412",
        "2413",
        "OSP6AU",
        "FI4D024P9",
        "FISD012R9",
        "FDSD012A9",
        "FSSL024NG",
        "FISX006W0",
    ]
    # Every part advances the bar exactly twice: once for obtaining the PDF
    # and once for parsing it (or twice as "skipped" when the download fails).
    total_steps = len(partnums) * 2
    with alive_bar(total_steps, dual_line=True, calibrate=30, bar="classic2", spinner="classic") as bar:
        for partnum in partnums:
            path = partnum + "/datasheet.pdf"
            # bartext is module-level state; try_download_datasheet appends
            # progress dots to it while streaming.
            bartext = "Downloading datasheet for part " + partnum
            bar.text = bartext

            # A file of more than one byte on disk counts as already downloaded.
            cached = os.path.exists(path) and os.path.getsize(path) > 1
            if cached:
                obtained_msg = "Using cached " + path
            elif try_download_datasheet(partnum) is not False:
                obtained_msg = "Downloaded " + path
            else:
                failure_msg = "Failed to download datasheet for part " + partnum
                print(failure_msg, end='')
                bar.text = failure_msg
                # Neither the download step nor the parse step happened.
                bar(skipped=True)
                bar(skipped=True)
                continue

            # Step 1: the datasheet is available (reused or freshly fetched).
            print(obtained_msg, end='')
            bar.text = obtained_msg
            bar(skipped=cached)

            # Step 2: extract the datasheet contents into the part's directory.
            print("Parsing Datasheet contents of " + path, end='')
            bar.text = "Parsing Datasheet contents of " + partnum + ".pdf..."
            read_datasheet.parse(path, partnum)
            bar(skipped=False)

View File

@ -8,12 +8,12 @@ import numpy as np
from PIL import Image
import io
def parse(filename):
def parse(filename, output_dir):
# Extract table data
tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
print("Total tables extracted:", tables.n)
#print("Total tables extracted:", tables.n)
n = 0
pagenum = 0
reader = PdfReader(filename)
@ -53,12 +53,11 @@ def parse(filename):
#print(table.df)
#camelot.plot(table, kind='grid').savefig("test" + str(n) + ".png")
n=n+1
camelot.plot(tables[0], kind='grid').savefig("test.png")
#camelot.plot(tables[0], kind='grid').savefig("test.png")
#tables.export('foo.csv', f='csv')
tables.export(output_dir + '/techdata.csv', f='csv')
print(table_list)
print(table_list["Variants"])
# print(table_list)
# Extract Basic details - part name & description, image, etc
reader = PdfReader(filename)
@ -71,19 +70,20 @@ def parse(filename):
if Image.open(io.BytesIO(image_file_object.data)).mode == "P":
skip = True
continue
with open("brand.png", "wb") as fp:
with open(output_dir + "/brand.png", "wb") as fp:
fp.write(image_file_object.data)
if Image.open(io.BytesIO(image_file_object.data)).size == (430, 430):
with open("part.png", "wb") as fp:
with open(output_dir + "/part.png", "wb") as fp:
fp.write(image_file_object.data)
if skip:
for image_file_object in page.images:
if image_file_object.name == "img1.png":
with open("brand.png", "wb") as fp:
with open(output_dir + "/brand.png", "wb") as fp:
fp.write(image_file_object.data)
count += 1
return table_list
if __name__ == "__main__":
parse("test2.pdf")
parse("test2.pdf", "10GXS13")

View File

@ -1,5 +1,9 @@
# Runtime
camelot-py[base]
opencv-python
pypdf2==2.12.1
matplotlib
#PyQt5
alive-progress
requests
# Development
matplotlib