Update Alphawire table parsing

2024-03-14 21:35:26 -05:00
parent 25ceb6c133
commit 39723ec442
2 changed files with 120 additions and 42 deletions
--- a/get_specs.py
+++ b/get_specs.py
@ -112,12 +112,14 @@ def query_search(partnum, source):
        r = requests.get(url=alphaurl)
        data = r.json()
        output = dict()
-        #print(data)
+        print(data["Results"])
        try:
            if data["Count"] > 0:
                #print(data["Results"][0]["Url"])
                for result in data["Results"]:
-                    if result["Url"].split("/")[-1] == partnum:
+                    #print(result["Url"].split("/")[-1], partnum.replace("-", "").replace("/", "_"))
                    if result["Url"].split("/")[-1] == partnum.replace("-", "").replace("/", "_"):
                        #print(partnum)
                        #print(result["Html"])
                        try:
@ -133,7 +135,7 @@ def query_search(partnum, source):
                        dsidx = result["Html"].index("<a href=\"/disteAPI/") + 9
                        dsidx2 = result["Html"].index(partnum, dsidx) + len(partnum)
                        output["datasheet"] = "https://www.alphawire.com" + result["Html"][dsidx:dsidx2]
-                        output["partnum"] = partnum
+                        output["partnum"] = partnum.replace("-", "").replace("/", "_")
                        #"test".index()
                        #print(output)
                        return output
@ -167,8 +169,9 @@ def get_multi(partnums, delay=0.25):
                    if r.status_code == 404:
                        return False
                    os.makedirs(output_dir, exist_ok=True)
                    bartext = ""
                    with open(output_dir + "/datasheet.pdf", 'wb') as f:
-                        for chunk in r.iter_content(chunk_size=131072): 
+                        for chunk in r.iter_content(chunk_size=65536): 
                            # If you have chunk encoded response uncomment if
                            # and set chunk_size parameter to None.
                            #if chunk: 
@ -195,8 +198,9 @@ def get_multi(partnums, delay=0.25):
                    if r.status_code == 404:
                        return False
                    os.makedirs(output_dir, exist_ok=True)
                    bartext = ""
                    with open(output_dir + "/datasheet.pdf", 'wb') as f:
-                        for chunk in r.iter_content(chunk_size=131072): 
+                        for chunk in r.iter_content(chunk_size=65536): 
                            # If you have chunk encoded response uncomment if
                            # and set chunk_size parameter to None.
                            #if chunk: 
@ -221,8 +225,9 @@ def get_multi(partnums, delay=0.25):
                    if r.status_code == 404:
                        return False
                    os.makedirs(output_dir, exist_ok=True)
                    bartext = ""
                    with open(output_dir + "/part-hires." + url.split(".")[-1], 'wb') as f:
-                        for chunk in r.iter_content(chunk_size=131072): 
+                        for chunk in r.iter_content(chunk_size=65536): 
                            # If you have chunk encoded response uncomment if
                            # and set chunk_size parameter to None.
                            #if chunk: 
@ -377,7 +382,7 @@ if __name__ == "__main__":
    "AW3050",
    "AW6714",
    "AW1172C",
-    "AW2211/4",
+    "AWFIT-221-1/4",
    "BLTF-1LF-006-RS5N",
    "BLTF-SD9-006-RI5N",
@ -421,7 +426,7 @@ if __name__ == "__main__":
    "BLC6D1100007"
    ]
-    #query_search("86104CY", "Alphawire")
+    #print(query_search("FIT-221-1/4", "Alphawire"))
    get_multi(partnums, 0.25)
    #query_search("10GXS13", "Belden")
--- a/read_datasheet.py
+++ b/read_datasheet.py
@ -19,17 +19,41 @@ def touch(path):
    with open(path, 'a'):
        os.utime(path, None)
 def extract_table_name(table_start, searchpage, reader, dstype, fallbackname):
    """Return the text lying in a thin vertical band around a table's top edge.

    The page's text fragments are visited one by one; every fragment whose
    y coordinate (taken from ``tm[5]``) falls strictly inside the band is
    concatenated, in visiting order, to form the table name.

    Args:
        table_start: y coordinate of the table's top edge on the page.
        searchpage: 1-based page number to search (``reader.pages`` is
            indexed with ``searchpage - 1``).
        reader: object exposing ``pages``; each page must provide
            ``extract_text(visitor_text=...)``.
        dstype: datasheet flavour, ``"Belden"`` or ``"Alphawire"``; selects
            the size of the search band.
        fallbackname: value returned (as ``str``) when no text is found.

    Returns:
        The collected text with leading/trailing newlines stripped, or
        ``str(fallbackname)`` if nothing was collected.

    Raises:
        ValueError: if ``dstype`` is not a supported datasheet type.
    """
    # Search band, expressed relative to the table's top edge. The Alphawire
    # band starts 5 units lower in y than the Belden one.
    if dstype == "Belden":
        ymin = table_start
        ymax = table_start + 10
    elif dstype == "Alphawire":
        ymin = table_start - 5
        ymax = table_start + 10
    else:
        # Without this, ymin/ymax would be unbound and the failure would only
        # show up later as a NameError inside the visitor callback.
        raise ValueError("unsupported datasheet type: " + repr(dstype))

    page = reader.pages[searchpage - 1]
    parts = []

    def visitor_body(text, cm, tm, fontDict, fontSize):
        # Keep only fragments strictly inside the band (bounds excluded).
        if ymin < tm[5] < ymax:
            parts.append(text)

    page.extract_text(visitor_text=visitor_body)
    text_body = "".join(parts).strip('\n')
    if not text_body:
        return str(fallbackname)
    return text_body
 def parse(filename, output_dir, partnum, dstype):
    # Extract table data
-
+    if dstype == "Belden":
-    tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
+        tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
    elif dstype == "Alphawire":
        tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=50, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': True, 'char_margin': 0.5}, shift_text=['l', 't'])
    #fprint("Total tables extracted:", tables.n)
    n = 0
-    pagenum = 0
+    #pagenum = 0
    reader = PdfReader(filename)
    page = reader.pages[0]
    table_list = {}
    table_list_raw = {}
    for table in tables:
        table.df.infer_objects(copy=False)
@ -43,25 +67,13 @@ def parse(filename, output_dir, partnum, dstype):
            # Extract table names
            table_start = table.cells[0][0].lt[1] # Read top-left cell's top-left coordinate
            #fprint(table_start)
            ymin = table_start
            ymax = table_start + 10
            if pagenum != table.page - 1:
                pagenum = table.page - 1
                page = reader.pages[table.page - 1]
            parts = []
            def visitor_body(text, cm, tm, fontDict, fontSize):
                y = tm[5]
                if y > ymin and y < ymax:
                    parts.append(text)
            page.extract_text(visitor_text=visitor_body)
            text_body = "".join(parts).strip('\n')
            if len(text_body) == 0:
                text_body = str(n)
            #fprint(text_body)
            text_body = extract_table_name(table_start, table.page, reader, dstype, n)
            table_list[text_body] = table.df
            if dstype == "Alphawire":
                table_list_raw[text_body] = table
            #table.to_html("table" + str(n) + ".html")
            #fprint(table.df)
@ -71,7 +83,7 @@ def parse(filename, output_dir, partnum, dstype):
    #tables.export(output_dir + '/techdata.json', f='json')
-    # fprint(table_list)
+    #fprint(table_list)
    # Extract Basic details - part name & description, image, etc
    reader = PdfReader(filename)
@ -100,24 +112,32 @@ def parse(filename, output_dir, partnum, dstype):
    tables = dict()
    torename = dict()
    previous_table = ""
    print(table_list.keys())
    for table_name in table_list.keys():
        # determine shape: horizontal or vertical
        table = table_list[table_name]
        rows = table.shape[0]
        cols = table.shape[1]
        vertical = None
        print(rows, cols, table_name)
        if rows > 2 and cols == 2:
            vertical = True
-        elif cols == 1:
+        elif cols == 1 and rows > 1:
            vertical = False
        elif rows == 1:
            vertical = True
        elif cols == 2: # and rows <= 2
            # inconsistent
-            if table.iloc[0, 0].find(":") == len(table.iloc[0, 0]) - 1: # check if last character is ":" indicating a vertical table
+            if dstype == "Belden":
-                vertical = True
+                if table.iloc[0, 0].find(":") == len(table.iloc[0, 0]) - 1: # check if last character is ":" indicating a vertical table
-            else:
+                    vertical = True
-                vertical = False
+                else:
                    vertical = False
            elif dstype == "Alphawire":
                if table.iloc[0, 0].find(")") == 1 or table.iloc[0, 0].find(")") == 2 or table.iloc[0, 0].find(":") == len(table.iloc[0, 0]) - 1: # check if last character is ":" indicating a vertical table
                    vertical = True
                else:
                    vertical = False
        elif cols > 2: # and rows <= 2
            vertical = False
@ -125,9 +145,12 @@ def parse(filename, output_dir, partnum, dstype):
            vertical = False
        else: # 1 column, <= 2 rows
            vertical = False
-
+        print(vertical)
        # missing name check
        for table_name_2 in table_list.keys(): 
            if dstype == "Alphawire" and table_name_2.find("\n") >= 0:
                torename[table_name_2] = table_name_2[0:table_name_2.find("\n")]
            if table_name_2.find(table.iloc[-1, 0]) >= 0:
                # Name taken from table directly above - this table does not have a name
                torename[table_name_2] = "Specs " + str(len(tables))
@ -136,8 +159,12 @@ def parse(filename, output_dir, partnum, dstype):
        if vertical:
            out = dict()
-            for row in table.itertuples(index=False, name=None):
+            if rows > 1:
-                out[row[0].replace("\n", " ").replace(":", "")] = row[1]
+                for row in table.itertuples(index=False, name=None):
                    out[row[0].replace("\n", " ").replace(":", "")] = row[1]
            else:
                for row in table.itertuples(index=False, name=None):
                    out[row[0].replace("\n", " ").replace(":", "")] = ""
        else: # horizontal
            out = dict()
@ -147,9 +174,52 @@ def parse(filename, output_dir, partnum, dstype):
        tables[table_name] = out
        # multi-page table check, Alphawire
        if dstype == "Alphawire" and table_name.isdigit():
            # table continues from previous page or has name on previous page
            thistbl = table_list_raw[table_name]
            prevtbl = table_list_raw[previous_table]
            if prevtbl.cells[-1][0].lb[1] < 50 and thistbl.cells[0][0].lt[1] > 600:
                # wraparound
                print("WRAP")
                print("PREV TABLE", prevtbl.df)
                print("THIS TABLE", thistbl.df)
                print("PREV TABLE CORNER", prevtbl.cells[-1][0].lb[1])
                print("THIS TABLE CORNER", thistbl.cells[0][0].lt[1])
                main_key = previous_table
                cont_key = table_name
                #print(vertical)
                if vertical == False:
                    main_keys = list(tables[main_key].keys())
                    for i, (cont_key, cont_values) in enumerate(tables[cont_key].items()):
                        if i < len(main_keys):
                            #print(tables[main_key][main_keys[i]])
                            tables[main_key][main_keys[i]] = (tuple(tables[main_key][main_keys[i]]) + (cont_key,) + cont_values)
                    del tables[table_name]
                else:
                    print(tables[cont_key].keys())
                    for key in tables[cont_key].keys():
                        print(main_key, key, cont_key, key)
                        tables[main_key][key] = tables[cont_key][key]
                    del tables[table_name]
            elif thistbl.cells[0][0].lt[1] > 600:
                # name on previous page (grrrr)
                print("NAMEABOVE")
                print("PREV TABLE", prevtbl.df)
                print("THIS TABLE", thistbl.df)
                print("PREV TABLE CORNER", prevtbl.cells[-1][0].lb[1])
                print("THIS TABLE CORNER", thistbl.cells[0][0].lt[1])
                name = extract_table_name(50, prevtbl.page,reader,dstype,table_name).strip("\n").strip()
                print("FOUND NAME:", name)
                torename[table_name] = name
-        # multi-page table check
+
        # multi-page table check, Belden
        if dstype == "Belden":
            if table_name.isdigit() and len(tables) > 1:
                #fprint(table_name)
@ -212,9 +282,13 @@ def parse(filename, output_dir, partnum, dstype):
    #print(output_table)
    run_cmd("rm \"" + output_dir + "\"/*.json") # not reliable!
-    with open(output_dir + "/" + output_table["searchspecs"]["id"] + ".json", 'w') as json_file:
+    with open(output_dir + "/search_" + output_table["searchspecs"]["id"] + ".json", 'w') as json_file:
        json.dump(output_table["searchspecs"], json_file)
-    touch(output_dir + "/parsed")
+    with open(output_dir + "/specs_" + output_table["partnum"] + ".json", 'w') as json_file:
        json.dump(output_table["fullspecs"], json_file)
    print(json.dumps(output_table, indent=2))
    touch(output_dir + "/parsed") # mark as parsed
    return output_table
@ -245,7 +319,6 @@ def flatten(tables):
                #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
            elif len(tables[table][key]) == 1:
                out[fullkeyname] = convert_to_number(tables[table][key][0])
                #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
            # if the item has at least two commas in it, split it
@ -256,7 +329,7 @@ def flatten(tables):
            # if the item has at least two commas in it, split it
            if tables[table][key].count(',') > 0:
                out[fullkeyname] = list(map(lambda x: x.strip(), tables[table][key].split(",")))
-                print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
+                #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
    #print("}")
@ -265,4 +338,4 @@ def flatten(tables):
 if __name__ == "__main__":
-    parse("test2.pdf", "cables/10GXS13", "10GXS13")
+    parse("cables/3050/datasheet.pdf", "cables/3050", "3050", "Alphawire")