Add basic label generator app, add return values to parsing
This commit is contained in:
@@ -1,9 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Parse Belden catalog techdata datasheets
|
||||
# Parse Belden (100%) & Alphawire (75%) catalog techdata datasheets
|
||||
|
||||
import pandas as pd
|
||||
pd.set_option('future.no_silent_downcasting', True)
|
||||
from PyPDF2 import PdfReader
|
||||
import camelot
|
||||
import numpy as np
|
||||
@@ -13,12 +12,25 @@ import json
|
||||
from util import fprint
|
||||
import uuid
|
||||
from util import run_cmd
|
||||
from util import win32
|
||||
import os
|
||||
import glob
|
||||
import sys
|
||||
|
||||
def touch(path):
    """Create ``path`` if it does not exist and set its timestamps to now.

    Existing file content is left untouched.

    Args:
        path: Filesystem path of the marker file to create or refresh.
    """
    # Append mode creates a missing file but never truncates an existing one.
    with open(path, mode='a'):
        # times=None sets both access and modification time to the current time.
        os.utime(path, times=None)
|
||||
|
||||
def find_data_file(filename):
    """Return the absolute path of a data file stored beside the application.

    If ``sys`` carries a truthy ``frozen`` attribute the application is
    treated as frozen and the file is resolved next to ``sys.executable``;
    otherwise it is resolved next to this source file.

    Args:
        filename: Name (or relative path) of the data file.

    Returns:
        ``filename`` joined onto the absolute application directory.
    """
    if getattr(sys, "frozen", False):
        # The application is frozen: data files live beside the executable.
        anchor = sys.executable
    else:
        # The application is not frozen.
        # Change this bit to match where you store your data files:
        anchor = __file__
    # Normalise to an absolute directory so the result stays valid when
    # __file__ is relative, when the working directory changes later, or when
    # the path is handed to another process.
    datadir = os.path.dirname(os.path.abspath(anchor))
    return os.path.join(datadir, filename)
|
||||
|
||||
def extract_table_name(table_start, searchpage, reader, dstype, fallbackname):
|
||||
if dstype == "Belden":
|
||||
ymin = table_start
|
||||
@@ -41,12 +53,25 @@ def extract_table_name(table_start, searchpage, reader, dstype, fallbackname):
|
||||
#fprint(text_body)
|
||||
|
||||
def parse(filename, output_dir, partnum, dstype):
|
||||
|
||||
tables = []
|
||||
# Extract table data
|
||||
if dstype == "Belden":
|
||||
tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
|
||||
elif dstype == "Alphawire":
|
||||
tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=50, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': True, 'char_margin': 0.5}, shift_text=['l', 't'])
|
||||
try:
|
||||
if dstype == "Belden":
|
||||
tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="ghostscript", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
|
||||
elif dstype == "Alphawire":
|
||||
tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="ghostscript", split_text=False, line_scale=50, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': True, 'char_margin': 0.5}, shift_text=['l', 't'])
|
||||
except OSError as e:
|
||||
print(e)
|
||||
if win32:
|
||||
print("Ghostscript is not installed! Launching installer...")
|
||||
#subprocess.run([r".\\gs10030w64.exe"])
|
||||
os.system(r'''Powershell -Command "& { Start-Process \"''' + find_data_file("gs10030w64.exe") + r'''\" -Verb RunAs } " ''')
|
||||
# Will return once file launched...
|
||||
print("Once the install is completed, try again.")
|
||||
return False
|
||||
else:
|
||||
print("Ghostscript is not installed. You can install it with e.g. apt install ghostscript for Debian-based systems.")
|
||||
return False
|
||||
#fprint("Total tables extracted:", tables.n)
|
||||
n = 0
|
||||
#pagenum = 0
|
||||
@@ -54,13 +79,14 @@ def parse(filename, output_dir, partnum, dstype):
|
||||
page = reader.pages[0]
|
||||
table_list = {}
|
||||
table_list_raw = {}
|
||||
|
||||
pd.set_option('future.no_silent_downcasting', True)
|
||||
for table in tables:
|
||||
#with pd.options.context("future.no_silent_downcasting", True):
|
||||
table.df.infer_objects(copy=False)
|
||||
table.df.replace('', np.nan, inplace=True)
|
||||
table.df = table.df.replace('', np.nan).infer_objects(copy=False)
|
||||
table.df.dropna(inplace=True, how="all")
|
||||
table.df.dropna(inplace=True, axis="columns", how="all")
|
||||
table.df.replace(np.nan, '', inplace=True)
|
||||
table.df = table.df.replace(np.nan, '').infer_objects(copy=False)
|
||||
|
||||
if not table.df.empty:
|
||||
#fprint("\nTable " + str(n))
|
||||
@@ -281,7 +307,12 @@ def parse(filename, output_dir, partnum, dstype):
|
||||
|
||||
#print(output_table)
|
||||
|
||||
run_cmd("rm \"" + output_dir + "\"/*.json") # not reliable!
|
||||
#run_cmd("rm \"" + output_dir + "\"/*.json") # not reliable!
|
||||
pattern = os.path.join(output_dir, '*.json')
|
||||
json_files = glob.glob(pattern)
|
||||
for file_path in json_files:
|
||||
os.remove(file_path)
|
||||
#print(f"Deleted {file_path}")
|
||||
with open(output_dir + "/search_" + output_table["searchspecs"]["id"] + ".json", 'w') as json_file:
|
||||
json.dump(output_table["searchspecs"], json_file)
|
||||
with open(output_dir + "/specs_" + output_table["partnum"] + ".json", 'w') as json_file:
|
||||
@@ -289,7 +320,7 @@ def parse(filename, output_dir, partnum, dstype):
|
||||
|
||||
#print(json.dumps(output_table, indent=2))
|
||||
touch(output_dir + "/parsed") # mark as parsed
|
||||
return output_table
|
||||
return True
|
||||
|
||||
|
||||
def flatten(tables):
|
||||
@@ -338,4 +369,4 @@ def flatten(tables):
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: parse the sample Alphawire datasheet exactly once and
    # print whatever parse() returns. Running it a second time would only
    # repeat the (slow) PDF table extraction and rewrite the same output files.
    print(parse("cables/3050/datasheet.pdf", "cables/3050", "3050", "Alphawire"))
|
||||
Reference in New Issue
Block a user