Add basic label generator app, add return values to parsing

2024-03-15 20:31:37 -05:00
parent 5502a5069d
commit 992040e812
9 changed files with 470 additions and 46 deletions
--- a/read_datasheet.py
+++ b/read_datasheet.py
@@ -1,9 +1,8 @@
 #!/usr/bin/env python3

-# Parse Belden catalog techdata datasheets 
+# Parse Belden (100%) & Alphawire (75%) catalog techdata datasheets 

 import pandas as pd
-pd.set_option('future.no_silent_downcasting', True)
 from PyPDF2 import PdfReader
 import camelot
 import numpy as np
@@ -13,12 +12,25 @@ import json
 from util import fprint
 import uuid
 from util import run_cmd
+from util import win32
 import os
+import glob
+import sys

 def touch(path):
    with open(path, 'a'):
        os.utime(path, None)

+def find_data_file(filename):
+    if getattr(sys, "frozen", False):
+        # Frozen (bundled) application: resolve data files next to the executable
+        datadir = os.path.dirname(sys.executable)
+    else:
+        # Running from source: resolve data files next to this module.
+        # Adjust this line if the data files are stored somewhere else:
+        datadir = os.path.dirname(__file__)
+    return os.path.join(datadir, filename)
+
 def extract_table_name(table_start, searchpage, reader, dstype, fallbackname):
    if dstype == "Belden":
        ymin = table_start
@@ -41,12 +53,25 @@ def extract_table_name(table_start, searchpage, reader, dstype, fallbackname):
    #fprint(text_body)

 def parse(filename, output_dir, partnum, dstype):
-
+    tables = []
    # Extract table data
-    if dstype == "Belden":
-        tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
-    elif dstype == "Alphawire":
-        tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=50, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': True, 'char_margin': 0.5}, shift_text=['l', 't'])
+    try:
+        if dstype == "Belden":
+            tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="ghostscript", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
+        elif dstype == "Alphawire":
+            tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="ghostscript", split_text=False, line_scale=50, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': True, 'char_margin': 0.5}, shift_text=['l', 't'])
+    except OSError as e: 
+        print(e)
+        if win32:
+            print("Ghostscript is not installed! Launching installer...")
+            #subprocess.run([r".\\gs10030w64.exe"])
+            os.system(r'''Powershell -Command "& { Start-Process \"''' + find_data_file("gs10030w64.exe") + r'''\" -Verb RunAs } " ''')
+            # The call above returns as soon as the installer has been launched; it does not wait for the install to finish.
+            print("Once the install is completed, try again.")
+            return False
+        else:
+            print("Ghostscript is not installed. You can install it with e.g. apt install ghostscript for Debian-based systems.")
+            return False
    #fprint("Total tables extracted:", tables.n)
    n = 0
    #pagenum = 0
@@ -54,13 +79,14 @@ def parse(filename, output_dir, partnum, dstype):
    page = reader.pages[0]
    table_list = {}
    table_list_raw = {}
-
+    pd.set_option('future.no_silent_downcasting', True)
    for table in tables:
+        #with pd.options.context("future.no_silent_downcasting", True):
        table.df.infer_objects(copy=False)
-        table.df.replace('', np.nan, inplace=True)
+        table.df = table.df.replace('', np.nan).infer_objects(copy=False)
        table.df.dropna(inplace=True, how="all")
        table.df.dropna(inplace=True, axis="columns", how="all")
-        table.df.replace(np.nan, '', inplace=True)
+        table.df = table.df.replace(np.nan, '').infer_objects(copy=False)
        
        if not table.df.empty:
            #fprint("\nTable " + str(n))
@@ -281,7 +307,12 @@ def parse(filename, output_dir, partnum, dstype):

    #print(output_table)

-    run_cmd("rm \"" + output_dir + "\"/*.json") # not reliable!
+    #run_cmd("rm \"" + output_dir + "\"/*.json") # not reliable!
+    pattern = os.path.join(output_dir, '*.json')
+    json_files = glob.glob(pattern)
+    for file_path in json_files:
+        os.remove(file_path)
+        #print(f"Deleted {file_path}")
    with open(output_dir + "/search_" + output_table["searchspecs"]["id"] + ".json", 'w') as json_file:
        json.dump(output_table["searchspecs"], json_file)
    with open(output_dir + "/specs_" + output_table["partnum"] + ".json", 'w') as json_file:
@@ -289,7 +320,7 @@ def parse(filename, output_dir, partnum, dstype):

    #print(json.dumps(output_table, indent=2))
    touch(output_dir + "/parsed") # mark as parsed
-    return output_table
+    return True


 def flatten(tables):
@@ -338,4 +369,4 @@ def flatten(tables):
    

 if __name__ == "__main__":
-    parse("cables/3050/datasheet.pdf", "cables/3050", "3050", "Alphawire")
+    print(parse("cables/3050/datasheet.pdf", "cables/3050", "3050", "Alphawire"))