Create main runner app, with async multithreading
This commit is contained in:
@@ -8,13 +8,14 @@ import numpy as np
|
||||
from PIL import Image
|
||||
import io
|
||||
import json
|
||||
from util import fprint
|
||||
|
||||
def parse(filename, output_dir):
|
||||
|
||||
# Extract table data
|
||||
|
||||
tables = camelot.read_pdf(filename, pages="1-end", flavor='lattice', backend="poppler", split_text=False, line_scale=100, process_background=True, resolution=600, interations=1, layout_kwargs={'detect_vertical': False, 'char_margin': 0.5}, shift_text=['r', 't'])
|
||||
#print("Total tables extracted:", tables.n)
|
||||
#fprint("Total tables extracted:", tables.n)
|
||||
n = 0
|
||||
pagenum = 0
|
||||
reader = PdfReader(filename)
|
||||
@@ -27,10 +28,10 @@ def parse(filename, output_dir):
|
||||
table.df.replace(np.nan, '', inplace=True)
|
||||
|
||||
if not table.df.empty:
|
||||
#print("\nTable " + str(n))
|
||||
#fprint("\nTable " + str(n))
|
||||
# Extract table names
|
||||
table_start = table.cells[0][0].lt[1] # Read top-left cell's top-left coordinate
|
||||
#print(table_start)
|
||||
#fprint(table_start)
|
||||
ymin = table_start
|
||||
ymax = table_start + 10
|
||||
if pagenum != table.page - 1:
|
||||
@@ -46,20 +47,20 @@ def parse(filename, output_dir):
|
||||
text_body = "".join(parts).strip('\n')
|
||||
if len(text_body) == 0:
|
||||
text_body = str(n)
|
||||
#print(text_body)
|
||||
#fprint(text_body)
|
||||
|
||||
|
||||
table_list[text_body] = table.df
|
||||
#table.to_html("table" + str(n) + ".html")
|
||||
|
||||
#print(table.df)
|
||||
#fprint(table.df)
|
||||
#camelot.plot(table, kind='grid').savefig("test" + str(n) + ".png")
|
||||
n=n+1
|
||||
#camelot.plot(tables[0], kind='grid').savefig("test.png")
|
||||
|
||||
#tables.export(output_dir + '/techdata.json', f='json')
|
||||
|
||||
# print(table_list)
|
||||
# fprint(table_list)
|
||||
# Extract basic details: part name and description, image, etc.
|
||||
|
||||
reader = PdfReader(filename)
|
||||
@@ -68,7 +69,7 @@ def parse(filename, output_dir):
|
||||
skip = False
|
||||
for image_file_object in page.images:
|
||||
if image_file_object.name == "img0.png" and skip == False:
|
||||
#print(Image.open(io.BytesIO(image_file_object.data)).mode)
|
||||
#fprint(Image.open(io.BytesIO(image_file_object.data)).mode)
|
||||
if Image.open(io.BytesIO(image_file_object.data)).mode == "P":
|
||||
skip = True
|
||||
continue
|
||||
@@ -137,20 +138,20 @@ def parse(filename, output_dir):
|
||||
|
||||
# multi-page table check
|
||||
if table_name.isdigit() and len(tables) > 1:
|
||||
print(table_name)
|
||||
print(previous_table)
|
||||
fprint(table_name)
|
||||
fprint(previous_table)
|
||||
|
||||
|
||||
|
||||
|
||||
main_key = previous_table
|
||||
cont_key = table_name
|
||||
print(tables)
|
||||
fprint(tables)
|
||||
if vertical == False:
|
||||
main_keys = list(tables[main_key].keys())
|
||||
for i, (cont_key, cont_values) in enumerate(tables[cont_key].items()):
|
||||
if i < len(main_keys):
|
||||
print(tables[main_key][main_keys[i]])
|
||||
fprint(tables[main_key][main_keys[i]])
|
||||
tables[main_key][main_keys[i]] = (tables[main_key][main_keys[i]] + (cont_key,) + cont_values)
|
||||
|
||||
del tables[table_name]
|
||||
@@ -163,7 +164,7 @@ def parse(filename, output_dir):
|
||||
previous_table = table_name
|
||||
|
||||
|
||||
print(tables)
|
||||
fprint(tables)
|
||||
with open(output_dir + "/tables.json", 'w') as json_file:
|
||||
json.dump(tables, json_file)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user