1
0
mirror of https://github.com/osmarks/meme-search-engine.git synced 2025-02-08 07:00:06 +00:00

Actually delete missing files

This commit is contained in:
osmarks 2024-01-02 14:12:26 +00:00
parent 4626f53bcb
commit e3ffc426b7

33
mse.py
View File

@ -122,6 +122,7 @@ class Index:
files[filename] = modtime files[filename] = modtime
await conn.commit() await conn.commit()
batch = [] batch = []
seen_files = set()
failed = set() failed = set()
for dirpath, _, filenames in os.walk(CONFIG["files"]): for dirpath, _, filenames in os.walk(CONFIG["files"]):
@ -131,6 +132,7 @@ class Index:
path = os.path.join(dirpath, file) path = os.path.join(dirpath, file)
file = os.path.relpath(path, CONFIG["files"]) file = os.path.relpath(path, CONFIG["files"])
st = os.stat(path) st = os.stat(path)
seen_files.add(file)
if st.st_mtime != files.get(file): if st.st_mtime != files.get(file):
paths.add(path) paths.add(path)
for task in asyncio.as_completed([ asyncio.get_running_loop().run_in_executor(executor, load_image, path, self.inference_server_config["image_size"]) for path in paths ]): for task in asyncio.as_completed([ asyncio.get_running_loop().run_in_executor(executor, load_image, path, self.inference_server_config["image_size"]) for path in paths ]):
@ -152,21 +154,7 @@ class Index:
print() print()
for failed_ in failed: for failed_ in failed:
print(failed_, "failed") print("Failed to load", failed_)
remove_indices = []
for index, filename in enumerate(self.associated_filenames):
if filename not in files or filename in modified:
remove_indices.append(index)
self.associated_filenames[index] = None
if filename not in files:
await conn.execute("DELETE FROM files WHERE filename = ?", (filename,))
await conn.commit()
# TODO concurrency
# TODO understand what that comment meant
if remove_indices:
self.faiss_index.remove_ids(numpy.array(remove_indices))
self.associated_filenames = [ x for x in self.associated_filenames if x is not None ]
filenames_set = set(self.associated_filenames) filenames_set = set(self.associated_filenames)
new_data = [] new_data = []
@ -181,6 +169,21 @@ class Index:
new_data = numpy.array(new_data) new_data = numpy.array(new_data)
self.associated_filenames.extend(new_filenames) self.associated_filenames.extend(new_filenames)
self.faiss_index.add(new_data) self.faiss_index.add(new_data)
remove_indices = []
for index, filename in enumerate(self.associated_filenames):
if filename not in seen_files or filename in modified:
remove_indices.append(index)
self.associated_filenames[index] = None
if filename not in seen_files:
await conn.execute("DELETE FROM files WHERE filename = ?", (filename,))
await conn.commit()
print("Deleting", len(remove_indices), "old entries")
# TODO concurrency
# TODO understand what that comment meant
if remove_indices:
self.faiss_index.remove_ids(numpy.array(remove_indices))
self.associated_filenames = [ x for x in self.associated_filenames if x is not None ]
finally: finally:
await conn.close() await conn.close()