Update Alphawire table parsing
parent 25ceb6c133
commit 39723ec442
get_specs.py
@@ -112,12 +112,14 @@ def query_search(partnum, source):
     r = requests.get(url=alphaurl)
     data = r.json()
     output = dict()
     #print(data)
+    print(data["Results"])
+
     try:
         if data["Count"] > 0:
             #print(data["Results"][0]["Url"])
             for result in data["Results"]:
-                if result["Url"].split("/")[-1] == partnum:
+                #print(result["Url"].split("/")[-1], partnum.replace("-", "").replace("/", "_"))
+                if result["Url"].split("/")[-1] == partnum.replace("-", "").replace("/", "_"):
                     #print(partnum)
                     #print(result["Html"])
                     try:
@@ -133,7 +135,7 @@ def query_search(partnum, source):
                         dsidx = result["Html"].index("<a href=\"/disteAPI/") + 9
                         dsidx2 = result["Html"].index(partnum, dsidx) + len(partnum)
                         output["datasheet"] = "https://www.alphawire.com" + result["Html"][dsidx:dsidx2]
-                        output["partnum"] = partnum
+                        output["partnum"] = partnum.replace("-", "").replace("/", "_")
                         #"test".index()
                         #print(output)
                         return output
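
The change to output["partnum"] applies the same normalization used in the URL match above: Alphawire's result slugs drop dashes and turn slashes into underscores. A minimal sketch of the rule (normalize_partnum is a hypothetical helper; the script inlines the replace() calls):

    def normalize_partnum(partnum):
        # "FIT-221-1/4" -> "FIT2211_4", matching the trailing segment of an
        # Alphawire result URL such as .../FIT2211_4
        return partnum.replace("-", "").replace("/", "_")

    assert normalize_partnum("FIT-221-1/4") == "FIT2211_4"
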
@@ -167,8 +169,9 @@ def get_multi(partnums, delay=0.25):
         if r.status_code == 404:
             return False
+        os.makedirs(output_dir, exist_ok=True)
         bartext = ""
         with open(output_dir + "/datasheet.pdf", 'wb') as f:
-            for chunk in r.iter_content(chunk_size=131072):
+            for chunk in r.iter_content(chunk_size=65536):
                 # If you have chunk encoded response uncomment if
                 # and set chunk_size parameter to None.
                 #if chunk:
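
This hunk and the two below make the same pair of changes at each download site: ensure the output directory exists before opening the file, and halve the streaming chunk size from 128 KiB to 64 KiB. For context, the surrounding pattern is the usual requests streamed download, roughly as sketched here (assuming the request was opened with stream=True; the URL and paths are placeholders):

    import os
    import requests

    r = requests.get("https://example.com/datasheet.pdf", stream=True)
    if r.status_code != 404:
        os.makedirs("cables/example", exist_ok=True)
        with open("cables/example/datasheet.pdf", "wb") as f:
            for chunk in r.iter_content(chunk_size=65536):  # 64 KiB per read
                f.write(chunk)
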
@@ -195,8 +198,9 @@ def get_multi(partnums, delay=0.25):
         if r.status_code == 404:
             return False
+        os.makedirs(output_dir, exist_ok=True)
         bartext = ""
         with open(output_dir + "/datasheet.pdf", 'wb') as f:
-            for chunk in r.iter_content(chunk_size=131072):
+            for chunk in r.iter_content(chunk_size=65536):
                 # If you have chunk encoded response uncomment if
                 # and set chunk_size parameter to None.
                 #if chunk:
@@ -221,8 +225,9 @@ def get_multi(partnums, delay=0.25):
         if r.status_code == 404:
             return False
+        os.makedirs(output_dir, exist_ok=True)
         bartext = ""
         with open(output_dir + "/part-hires." + url.split(".")[-1], 'wb') as f:
-            for chunk in r.iter_content(chunk_size=131072):
+            for chunk in r.iter_content(chunk_size=65536):
                 # If you have chunk encoded response uncomment if
                 # and set chunk_size parameter to None.
                 #if chunk:
@@ -377,7 +382,7 @@ if __name__ == "__main__":
         "AW3050",
         "AW6714",
         "AW1172C",
-        "AW2211/4",
+        "AWFIT-221-1/4",

         "BLTF-1LF-006-RS5N",
         "BLTF-SD9-006-RI5N",
@@ -421,7 +426,7 @@ if __name__ == "__main__":
         "BLC6D1100007"

     ]
-    #query_search("86104CY", "Alphawire")
+    #print(query_search("FIT-221-1/4", "Alphawire"))
     get_multi(partnums, 0.25)
     #query_search("10GXS13", "Belden")

@@ -19,17 +19,41 @@ def touch(path):
     with open(path, 'a'):
         os.utime(path, None)

+def extract_table_name(table_start, searchpage, reader, dstype, fallbackname):
+    if dstype == "Belden":
+        ymin = table_start
+        ymax = table_start + 10
+    elif dstype == "Alphawire":
+        ymin = table_start - 5
+        ymax = table_start + 10
+    page = reader.pages[searchpage - 1]
+    parts = []
+    def visitor_body(text, cm, tm, fontDict, fontSize):
+        y = tm[5]
+        if y > ymin and y < ymax:
+            parts.append(text)
+    page.extract_text(visitor_text=visitor_body)
+    text_body = "".join(parts).strip('\n')
+    if len(text_body) == 0:
+        text_body = str(fallbackname)
+
+    return text_body
+    #fprint(text_body)
+
 def parse(filename, output_dir, partnum, dstype):

     # Extract table data

+    if dstype == "Belden":
         tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
+    elif dstype == "Alphawire":
+        tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=50, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': True, 'char_margin': 0.5}, shift_text=['l', 't'])
     #fprint("Total tables extracted:", tables.n)
     n = 0
-    pagenum = 0
+    #pagenum = 0
     reader = PdfReader(filename)
     page = reader.pages[0]
     table_list = {}
+    table_list_raw = {}

     for table in tables:
         table.df.infer_objects(copy=False)
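
The new extract_table_name() factors out the pypdf visitor pattern the loop below used to inline: extract_text() invokes the visitor for every text fragment, tm[5] is the fragment's y translation in PDF points (measured from the page bottom), and filtering on it keeps only the text in a narrow band above the table's top edge. A standalone sketch of the technique, assuming pypdf's visitor API (the file name and band bounds are placeholders):

    from pypdf import PdfReader

    reader = PdfReader("example.pdf")
    parts = []

    def visitor_body(text, cm, tm, fontDict, fontSize):
        y = tm[5]  # y position of this fragment, in points from the page bottom
        if 700 < y < 720:  # keep only a 20 pt band near the top of the page
            parts.append(text)

    reader.pages[0].extract_text(visitor_text=visitor_body)
    print("".join(parts).strip("\n"))
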
@@ -43,25 +67,13 @@ def parse(filename, output_dir, partnum, dstype):
         # Extract table names
         table_start = table.cells[0][0].lt[1] # Read top-left cell's top-left coordinate
         #fprint(table_start)
-        ymin = table_start
-        ymax = table_start + 10
-        if pagenum != table.page - 1:
-            pagenum = table.page - 1
-            page = reader.pages[table.page - 1]
-        parts = []
-        def visitor_body(text, cm, tm, fontDict, fontSize):
-            y = tm[5]
-            if y > ymin and y < ymax:
-                parts.append(text)
-
-        page.extract_text(visitor_text=visitor_body)
-        text_body = "".join(parts).strip('\n')
-        if len(text_body) == 0:
-            text_body = str(n)
-        #fprint(text_body)
+        text_body = extract_table_name(table_start, table.page, reader, dstype, n)

         table_list[text_body] = table.df
+        if dstype == "Alphawire":
+            table_list_raw[text_body] = table

         #table.to_html("table" + str(n) + ".html")

         #fprint(table.df)
@@ -100,24 +112,32 @@ def parse(filename, output_dir, partnum, dstype):
     tables = dict()
     torename = dict()
+    previous_table = ""
+    print(table_list.keys())
     for table_name in table_list.keys():
         # determine shape: horizontal or vertical
         table = table_list[table_name]
         rows = table.shape[0]
         cols = table.shape[1]
         vertical = None
+        print(rows, cols, table_name)
         if rows > 2 and cols == 2:
             vertical = True
-        elif cols == 1:
+        elif cols == 1 and rows > 1:
             vertical = False
         elif rows == 1:
             vertical = True
         elif cols == 2: # and rows <= 2
             # inconsistent
             if dstype == "Belden":
                 if table.iloc[0, 0].find(":") == len(table.iloc[0, 0]) - 1: # check if last character is ":" indicating a vertical table
                     vertical = True
                 else:
                     vertical = False
+            elif dstype == "Alphawire":
+                if table.iloc[0, 0].find(")") == 1 or table.iloc[0, 0].find(")") == 2 or table.iloc[0, 0].find(":") == len(table.iloc[0, 0]) - 1: # check if last character is ":" indicating a vertical table
+                    vertical = True
+                else:
+                    vertical = False

         elif cols > 2: # and rows <= 2
             vertical = False
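
For reference, the shape heuristic above condensed into a standalone function (a restatement for readability, not the committed code): a "vertical" table is a key/value listing read row by row, a "horizontal" one is a header row with data columns, and the ambiguous two-column case is decided by punctuation in the first cell.

    def is_vertical(df, dstype):
        rows, cols = df.shape
        if rows > 2 and cols == 2:
            return True                    # many key/value rows
        if cols == 1 and rows > 1:
            return False
        if rows == 1:
            return True
        if cols == 2:                      # small, ambiguous table
            cell = df.iloc[0, 0]
            if dstype == "Alphawire" and cell.find(")") in (1, 2):
                return True                # "1)"-style row labels
            return cell.endswith(":")      # trailing ":" marks a key cell
        return False                       # wide tables read horizontally
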
@@ -125,9 +145,12 @@ def parse(filename, output_dir, partnum, dstype):
             vertical = False
         else: # 1 column, <= 2 rows
             vertical = False

+        print(vertical)
         # missing name check
         for table_name_2 in table_list.keys():
+            if dstype == "Alphawire" and table_name_2.find("\n") >= 0:
+                torename[table_name_2] = table_name_2[0:table_name_2.find("\n")]
+
             if table_name_2.find(table.iloc[-1, 0]) >= 0:
                 # Name taken from table directly above - this table does not have a name
                 torename[table_name_2] = "Specs " + str(len(tables))
@@ -136,8 +159,12 @@ def parse(filename, output_dir, partnum, dstype):

         if vertical:
             out = dict()
-            for row in table.itertuples(index=False, name=None):
-                out[row[0].replace("\n", " ").replace(":", "")] = row[1]
+            if rows > 1:
+                for row in table.itertuples(index=False, name=None):
+                    out[row[0].replace("\n", " ").replace(":", "")] = row[1]
+            else:
+                for row in table.itertuples(index=False, name=None):
+                    out[row[0].replace("\n", " ").replace(":", "")] = ""

         else: # horizontal
             out = dict()
@@ -147,9 +174,52 @@ def parse(filename, output_dir, partnum, dstype):

         tables[table_name] = out

+        # multi-page table check, Alphawire
+        if dstype == "Alphawire" and table_name.isdigit():
+            # table continues from previous page or has name on previous page
+            thistbl = table_list_raw[table_name]
+            prevtbl = table_list_raw[previous_table]
+
+            if prevtbl.cells[-1][0].lb[1] < 50 and thistbl.cells[0][0].lt[1] > 600:
+                # wraparound
+                print("WRAP")
+                print("PREV TABLE", prevtbl.df)
+                print("THIS TABLE", thistbl.df)
+                print("PREV TABLE CORNER", prevtbl.cells[-1][0].lb[1])
+                print("THIS TABLE CORNER", thistbl.cells[0][0].lt[1])
+                main_key = previous_table
+                cont_key = table_name
+                #print(vertical)
+                if vertical == False:
+                    main_keys = list(tables[main_key].keys())
+                    for i, (cont_key, cont_values) in enumerate(tables[cont_key].items()):
+                        if i < len(main_keys):
+                            #print(tables[main_key][main_keys[i]])
+                            tables[main_key][main_keys[i]] = (tuple(tables[main_key][main_keys[i]]) + (cont_key,) + cont_values)
+
+                    del tables[table_name]
+
+                else:
+                    print(tables[cont_key].keys())
+                    for key in tables[cont_key].keys():
+                        print(main_key, key, cont_key, key)
+                        tables[main_key][key] = tables[cont_key][key]
+                    del tables[table_name]
+
+            elif thistbl.cells[0][0].lt[1] > 600:
+                # name on previous page (grrrr)
+                print("NAMEABOVE")
+                print("PREV TABLE", prevtbl.df)
+                print("THIS TABLE", thistbl.df)
+                print("PREV TABLE CORNER", prevtbl.cells[-1][0].lb[1])
+                print("THIS TABLE CORNER", thistbl.cells[0][0].lt[1])
+                name = extract_table_name(50, prevtbl.page,reader,dstype,table_name).strip("\n").strip()
+                print("FOUND NAME:", name)
+                torename[table_name] = name
+

-        # multi-page table check
+        # multi-page table check, Belden
+        if dstype == "Belden":
             if table_name.isdigit() and len(tables) > 1:
                 #fprint(table_name)
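
In the horizontal wraparound branch above, the continuation table's "header" cells are really data, so each column of the main table gains the continuation's header cell plus that column's values. A toy illustration with made-up values:

    main = {"AWG": ("24",), "Stranding": ("7/32",)}    # main table
    cont = {"25": ("26",), "7/34": ("7/36",)}          # continuation page

    main_keys = list(main.keys())
    for i, (cont_key, cont_values) in enumerate(cont.items()):
        if i < len(main_keys):
            main[main_keys[i]] = tuple(main[main_keys[i]]) + (cont_key,) + cont_values

    print(main)  # {'AWG': ('24', '25', '26'), 'Stranding': ('7/32', '7/34', '7/36')}
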
@@ -212,9 +282,13 @@ def parse(filename, output_dir, partnum, dstype):
     #print(output_table)

     run_cmd("rm \"" + output_dir + "\"/*.json") # not reliable!
-    with open(output_dir + "/" + output_table["searchspecs"]["id"] + ".json", 'w') as json_file:
+    with open(output_dir + "/search_" + output_table["searchspecs"]["id"] + ".json", 'w') as json_file:
         json.dump(output_table["searchspecs"], json_file)
-    touch(output_dir + "/parsed")
+    with open(output_dir + "/specs_" + output_table["partnum"] + ".json", 'w') as json_file:
+        json.dump(output_table["fullspecs"], json_file)
+
+    print(json.dumps(output_table, indent=2))
+    touch(output_dir + "/parsed") # mark as parsed
     return output_table

@@ -245,7 +319,6 @@ def flatten(tables):
                 #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
             elif len(tables[table][key]) == 1:
                 out[fullkeyname] = convert_to_number(tables[table][key][0])
-
                 #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")

         # if the item has at least two commas in it, split it
@@ -256,7 +329,7 @@ def flatten(tables):
         # if the item has at least two commas in it, split it
         if tables[table][key].count(',') > 0:
             out[fullkeyname] = list(map(lambda x: x.strip(), tables[table][key].split(",")))
-            print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
+            #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")


     #print("}")
@@ -265,4 +338,4 @@ def flatten(tables):


 if __name__ == "__main__":
-    parse("test2.pdf", "cables/10GXS13", "10GXS13")
+    parse("cables/3050/datasheet.pdf", "cables/3050", "3050", "Alphawire")