Add basic label generator app, add return values to parsing

2024-03-15 20:31:37 -05:00
parent 5502a5069d
commit 992040e812
9 changed files with 470 additions and 46 deletions


@@ -1,9 +1,8 @@
#!/usr/bin/env python3
# Parse Belden catalog techdata datasheets
# Parse Belden (100%) & Alphawire (75%) catalog techdata datasheets
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)
from PyPDF2 import PdfReader
import camelot
import numpy as np
@@ -13,12 +12,25 @@ import json
from util import fprint
import uuid
from util import run_cmd
from util import win32
import os
import glob
import sys
def touch(path):
with open(path, 'a'):
os.utime(path, None)
def find_data_file(filename):
if getattr(sys, "frozen", False):
# The application is frozen
datadir = os.path.dirname(sys.executable)
else:
# The application is not frozen
# Change this bit to match where you store your data files:
datadir = os.path.dirname(__file__)
return os.path.join(datadir, filename)
def extract_table_name(table_start, searchpage, reader, dstype, fallbackname):
if dstype == "Belden":
ymin = table_start
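The find_data_file helper added above follows the usual pattern for frozen builds (cx_Freeze-style launchers set sys.frozen): bundled resources sit next to the executable when frozen and next to the source file when run from a checkout. A minimal usage sketch, with the installer filename taken from the Ghostscript fallback later in this diff:

# Usage sketch (illustrative only): resolve the bundled Ghostscript installer.
installer = find_data_file("gs10030w64.exe")
# frozen build        -> <folder containing the executable>/gs10030w64.exe
# running from source -> <folder containing the script>/gs10030w64.exe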
@@ -41,12 +53,25 @@ def extract_table_name(table_start, searchpage, reader, dstype, fallbackname):
#fprint(text_body)
def parse(filename, output_dir, partnum, dstype):
tables = []
# Extract table data
if dstype == "Belden":
tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
elif dstype == "Alphawire":
tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=50, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': True, 'char_margin': 0.5}, shift_text=['l', 't'])
try:
if dstype == "Belden":
tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="ghostscript", split_text=False, line_scale=100, process_background=True, resolution=600, iterations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
elif dstype == "Alphawire":
tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="ghostscript", split_text=False, line_scale=50, process_background=True, resolution=600, iterations=1, layout_kwargs={'detect_vertical': True, 'char_margin': 0.5}, shift_text=['l', 't'])
except OSError as e:
print(e)
if win32:
print("Ghostscript is not installed! Launching installer...")
#subprocess.run([r".\\gs10030w64.exe"])
os.system(r'''Powershell -Command "& { Start-Process \"''' + find_data_file("gs10030w64.exe") + r'''\" -Verb RunAs } " ''')
# Will return once file launched...
print("Once the install is completed, try again.")
return False
else:
print("Ghostscript is not installed. You can install it with e.g. apt install ghostscript for Debian-based systems.")
return False
#fprint("Total tables extracted:", tables.n)
n = 0
#pagenum = 0
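The Windows branch above launches the bundled Ghostscript installer elevated through os.system with hand-escaped PowerShell quoting. As a hedged alternative sketch only (not what this commit does), the same Start-Process -Verb RunAs call can be written with subprocess.run, which avoids the nested escaping as long as the installer path contains no single quotes:

import subprocess
# Sketch: launch the bundled installer with UAC elevation via PowerShell.
# Like the os.system call above, this returns as soon as the installer is launched.
installer = find_data_file("gs10030w64.exe")
subprocess.run(["powershell", "-Command", f"Start-Process '{installer}' -Verb RunAs"])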
@@ -54,13 +79,14 @@ def parse(filename, output_dir, partnum, dstype):
page = reader.pages[0]
table_list = {}
table_list_raw = {}
pd.set_option('future.no_silent_downcasting', True)
for table in tables:
#with pd.options.context("future.no_silent_downcasting", True):
table.df.infer_objects(copy=False)
table.df.replace('', np.nan, inplace=True)
table.df = table.df.replace('', np.nan).infer_objects(copy=False)
table.df.dropna(inplace=True, how="all")
table.df.dropna(inplace=True, axis="columns", how="all")
table.df.replace(np.nan, '', inplace=True)
table.df = table.df.replace(np.nan, '').infer_objects(copy=False)
if not table.df.empty:
#fprint("\nTable " + str(n))
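The reworked cleanup in this hunk drops the inplace replace() calls in favour of reassignment plus infer_objects(copy=False) under pandas' future.no_silent_downcasting option, which is the migration path the pandas 2.2 FutureWarning points at. A minimal sketch of the same chain on a toy table (values are made up):

import numpy as np
import pandas as pd

pd.set_option('future.no_silent_downcasting', True)
df = pd.DataFrame({0: ["Conductor", "", ""], 1: ["", "", ""], 2: ["AWG", "", "24"]})
df = df.replace('', np.nan).infer_objects(copy=False)        # blank cells -> NaN without silent downcasting
df = df.dropna(how="all").dropna(axis="columns", how="all")  # drop all-empty rows and columns
df = df.replace(np.nan, '').infer_objects(copy=False)        # back to empty strings for later string handling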
@@ -281,7 +307,12 @@ def parse(filename, output_dir, partnum, dstype):
#print(output_table)
run_cmd("rm \"" + output_dir + "\"/*.json") # not reliable!
#run_cmd("rm \"" + output_dir + "\"/*.json") # not reliable!
pattern = os.path.join(output_dir, '*.json')
json_files = glob.glob(pattern)
for file_path in json_files:
os.remove(file_path)
#print(f"Deleted {file_path}")
with open(output_dir + "/search_" + output_table["searchspecs"]["id"] + ".json", 'w') as json_file:
json.dump(output_table["searchspecs"], json_file)
with open(output_dir + "/specs_" + output_table["partnum"] + ".json", 'w') as json_file:
@@ -289,7 +320,7 @@ def parse(filename, output_dir, partnum, dstype):
#print(json.dumps(output_table, indent=2))
touch(output_dir + "/parsed") # mark as parsed
return output_table
return True
def flatten(tables):
@@ -338,4 +369,4 @@ def flatten(tables):
if __name__ == "__main__":
parse("cables/3050/datasheet.pdf", "cables/3050", "3050", "Alphawire")
print(parse("cables/3050/datasheet.pdf", "cables/3050", "3050", "Alphawire"))
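With this commit parse() signals its outcome instead of falling off the end: the Ghostscript fallback returns False and the success path appears to return True (the hunk above swaps out return output_table), which is what the __main__ block now prints. A hedged sketch of a caller using that value; the retry flow is illustrative, not part of this commit:

# Sketch: branch on parse()'s new return value (retry flow is made up).
ok = parse("cables/3050/datasheet.pdf", "cables/3050", "3050", "Alphawire")
if ok is False:
    input("Install Ghostscript, then press Enter to retry...")
    ok = parse("cables/3050/datasheet.pdf", "cables/3050", "3050", "Alphawire")
print("Parse succeeded" if ok else "Parse failed")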