Store files, start overall spec collector code
commit 665503b1d6
parent e511a13262
.gitignore (vendored)
@@ -1 +1,2 @@
 venv
+__pycache__
get_specs.py (new executable file, 85 lines)
@@ -0,0 +1,85 @@
#!/usr/bin/env python3

import os
import sys
import read_datasheet
from alive_progress import alive_bar
import requests


bartext = ""

def try_download_datasheet(partnum):
    global bartext

    sanitized_name = partnum.replace(" ", "")
    url = "https://catalog.belden.com/techdata/EN/" + sanitized_name + "_techdata.pdf"
    print(url)
    try:
        with requests.get(url, stream=True) as r:
            #r.raise_for_status()
            if r.headers.get("Content-Type") != "application/pdf":
                return False
            if r.status_code == 404:
                return False
            os.mkdir(partnum)
            with open(partnum + "/datasheet.pdf", 'wb') as f:
                for chunk in r.iter_content(chunk_size=131072):
                    # If you have chunk encoded response uncomment if
                    # and set chunk_size parameter to None.
                    #if chunk:
                    bartext = bartext + "."
                    bar.text = bartext
                    f.write(chunk)
            #print("")
            return sanitized_name + ".pdf"
    except KeyboardInterrupt:
        print("Quitting!")
        os.remove(partnum + "/datasheet.pdf")
        sys.exit()



if __name__ == "__main__":
    partnums = ["10GXS12", "RST 5L-RKT 5L-949",
                "10GXS13",
                "10GXW12",
                "10GXW13",
                "2412",
                "2413",
                "OSP6AU",
                "FI4D024P9",
                "FISD012R9",
                "FDSD012A9",
                "FSSL024NG",
                "FISX006W0",
                ]
    with alive_bar(len(partnums) * 2, dual_line=True, calibrate=30, bar="classic2", spinner="classic") as bar:
        for partnum in partnums:
            path = partnum + "/datasheet.pdf"
            bartext = "Downloading datasheet for part " + partnum
            bar.text = bartext
            if os.path.exists(path) and os.path.getsize(path) > 1:
                print("Using cached " + path, end='')
                bar.text = "Using cached " + path
                bar(skipped=True)
                print("Parsing Datasheet contents of " + path, end='')
                bar.text = "Parsing Datasheet contents of " + partnum + ".pdf..."
                read_datasheet.parse(path, partnum)
                bar(skipped=False)
            elif try_download_datasheet(partnum) is not False:
                print("Downloaded " + path, end='')
                bar.text = "Downloaded " + path
                bar(skipped=False)
                print("Parsing Datasheet contents of " + path, end='')
                bar.text = "Parsing Datasheet contents of " + partnum + ".pdf..."
                read_datasheet.parse(path, partnum)
                bar(skipped=False)
            else:
                print("Failed to download datasheet for part " + partnum, end='')
                bar.text = "Failed to download datasheet for part " + partnum
                bar(skipped=True)
                bar(skipped=True)
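Note that try_download_datasheet() above assumes the part directory does not exist yet and only signals failure by returning False. Below is a minimal sketch of the same download-and-cache flow written a little more defensively; it is not part of this commit, and the fetch_datasheet name, the timeout value, and the use of os.makedirs(..., exist_ok=True) are assumptions:

# Hypothetical sketch (not in this commit): same Belden tech-data download,
# but tolerant of an existing part directory and explicit about failure.
import os
import requests

def fetch_datasheet(partnum, base_url="https://catalog.belden.com/techdata/EN/"):
    sanitized = partnum.replace(" ", "")
    url = base_url + sanitized + "_techdata.pdf"
    path = os.path.join(partnum, "datasheet.pdf")
    with requests.get(url, stream=True, timeout=30) as r:
        # Mirror the original checks: require a 200 response with a PDF
        # content type, and treat anything else as "no datasheet".
        if r.status_code != 200 or r.headers.get("Content-Type") != "application/pdf":
            return None
        os.makedirs(partnum, exist_ok=True)  # do not fail if the folder already exists
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=131072):
                f.write(chunk)
    return path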
read_datasheet.py
@@ -8,12 +8,12 @@ import numpy as np
 from PIL import Image
 import io
 
-def parse(filename):
+def parse(filename, output_dir):
 
     # Extract table data
 
     tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
-    print("Total tables extracted:", tables.n)
+    #print("Total tables extracted:", tables.n)
     n = 0
     pagenum = 0
     reader = PdfReader(filename)
@@ -53,12 +53,11 @@ def parse(filename):
         #print(table.df)
         #camelot.plot(table, kind='grid').savefig("test" + str(n) + ".png")
         n=n+1
-    camelot.plot(tables[0], kind='grid').savefig("test.png")
+    #camelot.plot(tables[0], kind='grid').savefig("test.png")
 
-    #tables.export('foo.csv', f='csv')
+    tables.export(output_dir + '/techdata.csv', f='csv')
 
-    print(table_list)
-    print(table_list["Variants"])
+    # print(table_list)
     # Extract Basic details - part name & description, image, etc
 
     reader = PdfReader(filename)
@@ -71,19 +70,20 @@ def parse(filename):
             if Image.open(io.BytesIO(image_file_object.data)).mode == "P":
                 skip = True
                 continue
-            with open("brand.png", "wb") as fp:
+            with open(output_dir + "/brand.png", "wb") as fp:
                 fp.write(image_file_object.data)
         if Image.open(io.BytesIO(image_file_object.data)).size == (430, 430):
-            with open("part.png", "wb") as fp:
+            with open(output_dir + "/part.png", "wb") as fp:
                 fp.write(image_file_object.data)
     if skip:
         for image_file_object in page.images:
             if image_file_object.name == "img1.png":
-                with open("brand.png", "wb") as fp:
+                with open(output_dir + "/brand.png", "wb") as fp:
                     fp.write(image_file_object.data)
     count += 1
     return table_list
 
 
 if __name__ == "__main__":
-    parse("test2.pdf")
+    parse("test2.pdf", "10GXS13")
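With the output_dir parameter threaded through, the new tables.export(output_dir + '/techdata.csv', f='csv') call drops one CSV per extracted table into the part's directory. Below is a small sketch of gathering those files back up; it is not part of this commit, and the load_exported_tables name and the techdata*.csv glob pattern are assumptions about camelot's export naming:

# Hypothetical sketch (not in this commit): collect whatever CSVs
# tables.export() wrote for a given part directory.
import csv
import glob
import os

def load_exported_tables(output_dir):
    tables = {}
    for path in sorted(glob.glob(os.path.join(output_dir, "techdata*.csv"))):
        with open(path, newline="") as f:
            tables[os.path.basename(path)] = list(csv.reader(f))
    return tables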
requirements.txt
@@ -1,5 +1,9 @@
+# Runtime
 camelot-py[base]
 opencv-python
 pypdf2==2.12.1
-matplotlib
 #PyQt5
+alive-progress
+requests
+
+# Development
+matplotlib