diff --git a/.gitignore b/.gitignore
index a6f5b85..9ffdf12 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,10 +1,18 @@
+# python venv
 __pycache__
+# cable data folder(s)
 cables
+cables-sample.zip
+# meilisearch (mainly where I've put the data volume for the container)
+meili_data
+# IDE things
 .vscode
-output.log
+.idea
+# videos
 *.webm
 output.mp4
+# log files
 output.log
-cables-sample.zip
-*.png
\ No newline at end of file
+# images
+*.png
diff --git a/compose.yml b/compose.yml
new file mode 100644
index 0000000..51434a5
--- /dev/null
+++ b/compose.yml
@@ -0,0 +1,13 @@
+services:
+  meilisearch:
+    image: "getmeili/meilisearch:v1.6.2"
+    ports:
+      - "7700:7700"
+    environment:
+      MEILI_MASTER_KEY: fluffybunnyrabbit
+      MEILI_NO_ANALYTICS: "true"
+    volumes:
+      - "meili_data:/meili_data"
+
+volumes:
+  meili_data:
\ No newline at end of file
diff --git a/database.py b/database.py
deleted file mode 100644
index 2befec2..0000000
--- a/database.py
+++ /dev/null
@@ -1,140 +0,0 @@
-"""This module contains functionality for interacting with a PostgreSQL database. It will automatically handle error
-conditions (i.e. missing columns) without terminating the entire program. Use the :py:class:`DBConnector` class to
-handle database interactions, either as a standalone object or in a context manager."""
-from __future__ import annotations
-
-import os
-import psycopg2
-from psycopg2 import DatabaseError, OperationalError
-from psycopg2.errors import UndefinedColumn
-
-DB_ADDRESS = os.getenv('DB_ADDRESS', 'localhost')
-DB_PORT = os.getenv('DB_PORT', 5432)
-DB_USER = os.getenv('DB_USER', 'postgres')
-DB_PASSWORD = os.getenv('DB_PASSWORD', '')
-DB_NAME = os.getenv('DB_NAME', 'postgres')
-DB_TABLE = os.getenv('DB_TABLE', 'cables')
-
-
-class DBConnector:
-    """Context managed database class. Use with statements to automatically open and close the database connection, like
-    so:
-
-    .. code-block:: python
-        with DBConnector() as db:
-            db.read()
-    """
-
-    def _db_start(self):
-        """Setup the database connection and cursor."""
-        try:
-            self.conn = psycopg2.connect(
-                f"host={DB_ADDRESS} port={DB_PORT} dbname={DB_NAME} user={DB_USER} password={DB_PASSWORD}")
-            self.cur = self.conn.cursor()
-        except OperationalError as e:
-            raise e
-
-    def _db_stop(self):
-        """Close the cursor and connection."""
-        self.cur.close()
-        self.conn.close()
-
-    def __init__(self):
-        self._db_start()
-
-    def __del__(self):
-        self._db_stop()
-
-    def __enter__(self):
-        self._db_start()
-
-    def __exit__(self):
-        self._db_stop()
-
-    def _get_cols(self) -> set[str]:
-        """Get the list of columns in the database.
-
-        :return: A list of column names."""
-        query = f"select COLUMN_NAME from information_schema.columns where table_name={DB_TABLE}"
-        rows = {x["COLUMN_NAME"] for x in self._query(query)}
-        return rows
-
-    def _column_parity(self, columns: list[str] | set[str]) -> set[str]:
-        """If the listed columns are not in the database, add them.
-
-        :param columns: The columns we expect are in the database.
-        :return: The list of columns in the database after querying."""
-        cols = set(columns)
-        existing = self._get_cols()
-        needs = cols.difference(existing.intersection(cols))
-        if len(needs) > 0:
-            query = f"ALTER TABLE {DB_TABLE} {', '.join([f'ADD COLUMN {c}' for c in needs])}"
-            self._query(query)
-            existing = self._get_cols()
-        return existing
-
-    def _query(self, sql) -> list[dict]:
-        """Basic function for running queries.
-
-        :param sql: SQL query as plaintext.
-        :return: Results of the query, or an empty list if none."""
-        result = []
-        try:
-            self.cur.execute(sql)
-            result = self._read_dict()
-        except DatabaseError as e:
-            print(f"ERROR {e.pgcode}: {e.pgerror}\n"
-                  f"Caused by query: {sql}")
-        finally:
-            return result
-
-    def _read_dict(self) -> list[dict]:
-        """Read the cursor as a list of dictionaries. psycopg2 defaults to using a list of tuples, so we want to convert
-        each row into a dictionary before we return it."""
-        cols = [i.name for i in self.cur.description]
-        results = []
-        for row in self.cur:
-            row_dict = {}
-            for i in range(0, len(row)):
-                if row[i]:
-                    row_dict = {**row_dict, cols[i]: row[i]}
-            results.append(row_dict)
-        return results
-
-    def read(self, **kwargs) -> list[dict]:
-        """Read rows from a database that match the specified filters.
-
-        :param kwargs: Column constraints; i.e. what value to filter by in what column.
-        :returns: A list of dictionaries of all matching rows, or an empty list if no match."""
-        args = []
-        for kw in kwargs.keys():
-            args.append(f"{kw} ILIKE {kwargs['kw']}")
-        query = f"SELECT * FROM {DB_TABLE}"
-        if len(args) > 0:
-            query += f" WHERE {' AND '.join(args)}"
-        return self._query(query)
-
-    def write(self, **kwargs) -> dict:
-        """Write a row to the database.
-
-        :param kwargs: Values to write for each database; specify each column separately!
-        :returns: The row you just added."""
-        self._column_parity(set(kwargs.keys()))
-        values = []
-        for val in kwargs.keys():
-            values.append(kwargs[val])
-        query = f"INSERT INTO {DB_TABLE} ({', '.join(kwargs.keys())}) VALUES ({', '.join(values)})"
-        self._query(query)
-        return kwargs
-
-    def write_all(self, items: list[dict]) -> list[dict]:
-        """Write multiple rows to the database.
-
-        :param items: Rows to write, as a list of dictionaries.
-        :returns: The rows that were added successfully."""
-        successes = []
-        for i in items:
-            res0 = self.write(**i)
-            if res0:
-                successes.append(res0)
-        return successes
diff --git a/read_datasheet.py b/read_datasheet.py
index 041856b..c928f1c 100755
--- a/read_datasheet.py
+++ b/read_datasheet.py
@@ -30,7 +30,7 @@ def parse(filename, output_dir, partnum, dstype):
     reader = PdfReader(filename)
     page = reader.pages[0]
     table_list = {}
-    
+
     for table in tables:
        table.df.infer_objects(copy=False)
        table.df.replace('', np.nan, inplace=True)
@@ -253,6 +253,11 @@ def flatten(tables):
            out[fullkeyname] = list(map(lambda x: x.strip(), tables[table][key].split(",")))
            #print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
 
+            # if the item has at least one comma in it, split it into a list
+            if tables[table][key].count(',') > 0:
+                out[fullkeyname] = list(map(lambda x: x.strip(), tables[table][key].split(",")))
+                print("\"" + keyname + "\":", "\"" + str(out[fullkeyname]) + "\",")
+
     #print("}")
     return out
diff --git a/requirements.txt b/requirements.txt
index bab6010..bbe8914 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ pypdf2==2.12.1
 alive-progress
 requests
 git+https://github.com/Byeongdulee/python-urx.git
-psycopg2-binary
+meilisearch
 pyyaml
 Flask
 selenium
diff --git a/search.py b/search.py
new file mode 100644
index 0000000..6e6a720
--- /dev/null
+++ b/search.py
@@ -0,0 +1,117 @@
+"""Interactions with the Meilisearch API for adding and searching cables."""
+from meilisearch import Client
+from meilisearch.task import TaskInfo
+from meilisearch.errors import MeilisearchApiError
+import json
+
+DEFAULT_URL = "http://localhost:7700"
+DEFAULT_APIKEY = "fluffybunnyrabbit"  # I WOULD RECOMMEND SOMETHING MORE SECURE
+DEFAULT_INDEX = "cables"
+DEFAULT_FILTERABLE_ATTRS = ["partnum", "uuid", "position"]  # default filterable attributes
+
+
+class JukeboxSearch:
+    """Class for interacting with the Meilisearch API."""
+    def __init__(self,
+                 url: str = None,
+                 api_key: str = None,
+                 index: str = None,
+                 filterable_attrs: list = None):
+        """Connect to Meilisearch and perform first-run tasks as necessary.
+
+        :param url: Address of the Meilisearch server. Defaults to ``http://localhost:7700`` if unspecified.
+        :param api_key: API key used to authenticate with Meilisearch. It is highly recommended to set this to
+        something secure if the endpoint is publicly accessible; if unspecified, it falls back to the default key
+        ``fluffybunnyrabbit`` (the same key used in ``compose.yml``).
+        :param index: The name of the index to configure. Defaults to ``cables`` if unspecified.
+        :param filterable_attrs: List of all the attributes we want to filter by."""
+        # connect to Meilisearch
+        url = url or DEFAULT_URL
+        api_key = api_key or DEFAULT_APIKEY
+        filterable_attrs = filterable_attrs or DEFAULT_FILTERABLE_ATTRS
+        self.index = index or DEFAULT_INDEX
+        self.client = Client(url, api_key)
+        # create the index if it does not exist already
+        try:
+            self.client.get_index(self.index)
+        except MeilisearchApiError as _:
+            self.client.create_index(self.index)
+        # make a variable to easily reference the index
+        self.idxref = self.client.index(self.index)
+
+        # update filterable attributes if needed
+        self.update_filterables(filterable_attrs)
+
+    def add_document(self, document: dict) -> TaskInfo:
+        """Add a cable to the Meilisearch index.
+
+        :param document: Dictionary containing all the cable data.
+        :returns: A TaskInfo object for the addition of the new document."""
+        return self.idxref.add_documents([document])
+
+    def add_documents(self, documents: list):
+        """Add a list of cables to the Meilisearch index.
+
+        :param documents: List of dictionaries containing all the cable data.
+        :returns: A TaskInfo object for the last new document."""
+        taskinfo = None
+        for i in documents:
+            taskinfo = self.add_document(i)
+        return taskinfo
+
+    def update_filterables(self, filterables: list):
+        """Update filterable attributes and wait for the database to finish reindexing. If the requested filterable
+        attributes already match the current attributes in the database, don't update (saves reindexing).
+
+        :param filterables: List of all filterable attributes"""
+        existing_filterables = self.idxref.get_filterable_attributes()
+        if len(set(filterables).difference(set(existing_filterables))) > 0:
+            taskref = self.idxref.update_filterable_attributes(filterables)
+            self.client.wait_for_task(taskref.task_uid)
+
+    def search(self, query: str, filters: str = None):
+        """Execute a search query on the Meilisearch index.
+
+        :param query: Search query
+        :param filters: A Meilisearch-compatible filter statement.
+        :returns: The search results dict. The actual hits are in a list under ``"hits"``, but the root element also
+        contains other useful values (e.g. ``estimatedTotalHits``)."""
+        if filters:
+            q = self.idxref.search(query, {"filter": filters})
+        else:
+            q = self.idxref.search(query)
+        return q
+
+    def _filter_one(self, filter: str):
+        """Get the first item to match a filter.
+
+        :param filter: A Meilisearch-compatible filter statement.
+        :returns: A dict containing the first result; if no results are found, an empty dict."""
+        q = self.search("", filter)
+        if q["estimatedTotalHits"] != 0:
+            return q["hits"][0]
+        else:
+            return dict()
+
+    def get_position(self, position: str):
+        """Get a part by position.
+
+        :param position: The position to search for."""
+        return self._filter_one(f"position = {position}")
+
+    def get_uuid(self, uuid: str):
+        """Get a specific UUID.
+
+        :param uuid: The UUID to search for."""
+        return self._filter_one(f"uuid = {uuid}")
+
+    def get_partnum(self, partnum: str):
+        """Get a specific part number.
+
+        :param partnum: The part number to search for."""
+        return self._filter_one(f"partnum = {partnum}")
+
+# entrypoint
+if __name__ == "__main__":
+    jbs = JukeboxSearch()
\ No newline at end of file
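A minimal usage sketch for the new JukeboxSearch wrapper, assuming the Meilisearch container from compose.yml is running locally with the default master key; the part records and values below are placeholders, not real data:

from search import JukeboxSearch

# connect using the defaults baked into search.py
# (http://localhost:7700, key "fluffybunnyrabbit", index "cables")
jbs = JukeboxSearch()

# index a couple of placeholder cable records; Meilisearch indexes asynchronously,
# so the documents may take a moment to become searchable
jbs.add_documents([
    {"uuid": "0000-0001", "partnum": "EX1234", "position": "1"},
    {"uuid": "0000-0002", "partnum": "EX5678", "position": "2"},
])

# full-text search, plus the filter-based helpers
print(jbs.search("EX1234")["hits"])
print(jbs.get_partnum("EX1234"))
print(jbs.get_position("2"))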