Store files, start overall spec collector code
This commit is contained in:
parent
e511a13262
commit
665503b1d6
3
.gitignore
vendored
3
.gitignore
vendored
@ -1 +1,2 @@
|
|||||||
venv
|
venv
|
||||||
|
__pycache__
|
85
get_specs.py
Executable file
85
get_specs.py
Executable file
@ -0,0 +1,85 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import read_datasheet
|
||||||
|
from alive_progress import alive_bar
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
bartext = ""
|
||||||
|
|
||||||
|
def try_download_datasheet(partnum):
    """Download the Belden tech-data PDF for *partnum*.

    The PDF is fetched from catalog.belden.com (spaces are stripped from
    the part number to build the URL) and streamed to
    ``<partnum>/datasheet.pdf``.  One dot is appended to the module-level
    ``bartext`` per received chunk so the progress bar shows activity.

    Args:
        partnum: Part number as listed in the catalog; may contain spaces.

    Returns:
        ``<sanitized part number>.pdf`` on success, or ``False`` when the
        server does not answer 200 with a PDF body or the request fails.

    Raises:
        SystemExit: if the user interrupts the download with Ctrl-C.
    """
    global bartext

    sanitized_name = partnum.replace(" ", "")
    url = "https://catalog.belden.com/techdata/EN/" + sanitized_name + "_techdata.pdf"
    print(url)

    target = partnum + "/datasheet.pdf"
    # `bar` is only bound when this file runs as a script; look it up
    # defensively so an importing caller does not hit a NameError.
    progress = globals().get("bar")

    started_writing = False
    complete = False
    try:
        # A timeout keeps one stalled connection from hanging the whole batch.
        with requests.get(url, stream=True, timeout=30) as r:
            # Reject every non-success status, not just 404.
            if r.status_code != 200:
                return False
            # The header may carry parameters ("application/pdf; charset=...").
            content_type = r.headers.get("Content-Type", "")
            if content_type.split(";")[0].strip().lower() != "application/pdf":
                return False

            # The directory may survive from an earlier, failed attempt.
            os.makedirs(partnum, exist_ok=True)
            started_writing = True
            with open(target, "wb") as f:
                for chunk in r.iter_content(chunk_size=131072):
                    bartext = bartext + "."
                    if progress is not None:
                        progress.text = bartext
                    f.write(chunk)
        complete = True
        return sanitized_name + ".pdf"
    except KeyboardInterrupt:
        print("Quitting!")
        sys.exit()
    except requests.RequestException:
        # Connection problems, timeouts, truncated streams: report failure
        # to the caller instead of aborting the remaining part numbers.
        return False
    finally:
        # Never leave a truncated PDF behind: the caller treats any file
        # larger than one byte as a valid cached datasheet.
        if started_writing and not complete and os.path.exists(target):
            os.remove(target)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Part numbers to collect specs for; each contributes two progress
    # steps (obtain the datasheet, then parse it).
    partnums = [
        "10GXS12",
        "RST 5L-RKT 5L-949",
        "10GXS13",
        "10GXW12",
        "10GXW13",
        "2412",
        "2413",
        "OSP6AU",
        "FI4D024P9",
        "FISD012R9",
        "FDSD012A9",
        "FSSL024NG",
        "FISX006W0",
    ]
    total_steps = len(partnums) * 2

    with alive_bar(total_steps, dual_line=True, calibrate=30, bar="classic2", spinner="classic") as bar:
        for partnum in partnums:
            path = partnum + "/datasheet.pdf"
            bartext = "Downloading datasheet for part " + partnum
            bar.text = bartext

            # A datasheet already on disk (larger than one byte) is reused.
            cached = os.path.exists(path) and os.path.getsize(path) > 1
            if cached:
                status = "Using cached " + path
            elif try_download_datasheet(partnum) is not False:
                status = "Downloaded " + path
            else:
                # Nothing to parse: account for both steps as skipped.
                failure = "Failed to download datasheet for part " + partnum
                print(failure, end='')
                bar.text = failure
                bar(skipped=True)
                bar(skipped=True)
                continue

            # Step 1: datasheet obtained (counted as skipped when cached).
            print(status, end='')
            bar.text = status
            bar(skipped=cached)

            # Step 2: parse the datasheet, writing output under the part's directory name.
            print("Parsing Datasheet contents of " + path, end='')
            bar.text = "Parsing Datasheet contents of " + partnum + ".pdf..."
            read_datasheet.parse(path, partnum)
            bar(skipped=False)
|
||||||
|
|
@ -8,12 +8,12 @@ import numpy as np
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
import io
|
import io
|
||||||
|
|
||||||
def parse(filename):
|
def parse(filename, output_dir):
|
||||||
|
|
||||||
# Extract table data
|
# Extract table data
|
||||||
|
|
||||||
tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
|
tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
|
||||||
print("Total tables extracted:", tables.n)
|
#print("Total tables extracted:", tables.n)
|
||||||
n = 0
|
n = 0
|
||||||
pagenum = 0
|
pagenum = 0
|
||||||
reader = PdfReader(filename)
|
reader = PdfReader(filename)
|
||||||
@ -53,12 +53,11 @@ def parse(filename):
|
|||||||
#print(table.df)
|
#print(table.df)
|
||||||
#camelot.plot(table, kind='grid').savefig("test" + str(n) + ".png")
|
#camelot.plot(table, kind='grid').savefig("test" + str(n) + ".png")
|
||||||
n=n+1
|
n=n+1
|
||||||
camelot.plot(tables[0], kind='grid').savefig("test.png")
|
#camelot.plot(tables[0], kind='grid').savefig("test.png")
|
||||||
|
|
||||||
#tables.export('foo.csv', f='csv')
|
tables.export(output_dir + '/techdata.csv', f='csv')
|
||||||
|
|
||||||
print(table_list)
|
# print(table_list)
|
||||||
print(table_list["Variants"])
|
|
||||||
# Extract Basic details - part name & description, image, etc
|
# Extract Basic details - part name & description, image, etc
|
||||||
|
|
||||||
reader = PdfReader(filename)
|
reader = PdfReader(filename)
|
||||||
@ -71,19 +70,20 @@ def parse(filename):
|
|||||||
if Image.open(io.BytesIO(image_file_object.data)).mode == "P":
|
if Image.open(io.BytesIO(image_file_object.data)).mode == "P":
|
||||||
skip = True
|
skip = True
|
||||||
continue
|
continue
|
||||||
with open("brand.png", "wb") as fp:
|
with open(output_dir + "/brand.png", "wb") as fp:
|
||||||
fp.write(image_file_object.data)
|
fp.write(image_file_object.data)
|
||||||
if Image.open(io.BytesIO(image_file_object.data)).size == (430, 430):
|
if Image.open(io.BytesIO(image_file_object.data)).size == (430, 430):
|
||||||
with open("part.png", "wb") as fp:
|
with open(output_dir + "/part.png", "wb") as fp:
|
||||||
fp.write(image_file_object.data)
|
fp.write(image_file_object.data)
|
||||||
if skip:
|
if skip:
|
||||||
for image_file_object in page.images:
|
for image_file_object in page.images:
|
||||||
if image_file_object.name == "img1.png":
|
if image_file_object.name == "img1.png":
|
||||||
with open("brand.png", "wb") as fp:
|
with open(output_dir + "/brand.png", "wb") as fp:
|
||||||
fp.write(image_file_object.data)
|
fp.write(image_file_object.data)
|
||||||
count += 1
|
count += 1
|
||||||
|
return table_list
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parse("test2.pdf")
|
parse("test2.pdf", "10GXS13")
|
@ -1,5 +1,9 @@
|
|||||||
|
# Runtime
|
||||||
camelot-py[base]
|
camelot-py[base]
|
||||||
opencv-python
|
opencv-python
|
||||||
pypdf2==2.12.1
|
pypdf2==2.12.1
|
||||||
matplotlib
|
alive-progress
|
||||||
#PyQt5
|
requests
|
||||||
|
|
||||||
|
# Development
|
||||||
|
matplotlib
|
Loading…
x
Reference in New Issue
Block a user