From 74bd4bfc2f172c5355a43ea6cc5554caf006d7ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Fri, 22 May 2020 15:07:00 +0200 Subject: [PATCH 01/30] Add first version of ANN --- .Dockerignore | 1 + Dockerfile | 11 +++++ api.py | 102 +++++++++++++++++++++++++++++++++++++++++++++++ gunicorn.py | 3 ++ requirements.txt | 6 +++ settings.py | 10 +++++ utils.py | 90 +++++++++++++++++++++++++++++++++++++++++ 7 files changed, 223 insertions(+) create mode 100644 .Dockerignore create mode 100644 Dockerfile create mode 100644 api.py create mode 100644 gunicorn.py create mode 100644 requirements.txt create mode 100644 settings.py create mode 100644 utils.py diff --git a/.Dockerignore b/.Dockerignore new file mode 100644 index 0000000..6320cd2 --- /dev/null +++ b/.Dockerignore @@ -0,0 +1 @@ +data \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..5bde8ef --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM python:3.7-slim + +WORKDIR /opt/ann + +COPY *.py /opt/ann/ +COPY requirements.txt /opt/ann/ + +RUN apt-get update && apt-get install --no-install-recommends -y build-essential && pip3 install -r /opt/ann/requirements.txt + +WORKDIR /opt/ann +ENTRYPOINT ["/usr/local/bin/gunicorn", "--config", "/opt/ann/gunicorn.py", "api:api"] diff --git a/api.py b/api.py new file mode 100644 index 0000000..a437754 --- /dev/null +++ b/api.py @@ -0,0 +1,102 @@ +import pathlib +from typing import List + +import annoy +import falcon +from falcon_cors import CORS +from falcon_multipart.middleware import MultipartMiddleware +import sentry_sdk +from sentry_sdk.integrations.falcon import FalconIntegration + +from utils import get_logger, text_file_iter +import settings + +logger = get_logger() + +sentry_sdk.init(dsn=settings.SENTRY_DSN, integrations=[FalconIntegration()]) + + +def load_index(file_path: pathlib.Path) -> annoy.AnnoyIndex: + index = annoy.AnnoyIndex(settings.INDEX_DIM, "euclidean") + index.load(str(file_path), prefault=True) + return index + + +def load_keys(file_path: pathlib.Path) -> List[int]: + return [int(x) for x in text_file_iter(file_path)] + + +INDEX = load_index(settings.INDEX_PATH) +KEYS = load_keys(settings.KEYS_PATH) +KEY_TO_ANN_ID = {x: i for i, x in enumerate(KEYS)} + + +class ANNResource: + def on_get(self, req: falcon.Request, resp: falcon.Response, logo_id: int): + if logo_id not in KEY_TO_ANN_ID: + resp.status = falcon.HTTP_404 + return + + count = req.get_param_as_int("count", min_value=1, max_value=500, default=100) + item_index = KEY_TO_ANN_ID[logo_id] + + indexes, distances = INDEX.get_nns_by_item( + item_index, count + 1, include_distances=True + ) + + if indexes.pop(0) != item_index: + logger.warning("Most similar logo is not provided logo") + + distances.pop(0) + + logo_ids = [KEYS[index] for index in indexes] + results = [] + + for ann_logo_id, distance in zip(logo_ids, distances): + results.append({"distance": distance, "logo_id": ann_logo_id}) + + resp.media = {"results": results, "count": len(results)} + + +class ANNEmbeddingResource: + def on_post(self, req: falcon.Request, resp: falcon.Response): + count = req.media.get("count", 1) + embedding = req.media["embedding"] + + if len(embedding) != settings.INDEX_DIM: + raise falcon.HTTPBadRequest( + "invalid dimension", + "embedding must be of size {}, here: {}".format( + settings.INDEX_DIM, len(embedding) + ), + ) + + indexes, distances = INDEX.get_nns_by_vector( + embedding, count, include_distances=True + ) + + logo_ids = [KEYS[index] for index in 
indexes] + results = [] + + for ann_logo_id, distance in zip(logo_ids, distances): + results.append({"distance": distance, "logo_id": ann_logo_id}) + + resp.media = {"results": results, "count": len(results)} + + +cors = CORS( + allow_all_origins=True, + allow_all_headers=True, + allow_all_methods=True, + allow_credentials_all_origins=True, + max_age=600, +) + +api = falcon.API(middleware=[cors.middleware, MultipartMiddleware()]) + +# Parse form parameters +api.req_options.auto_parse_form_urlencoded = True +api.req_options.strip_url_path_trailing_slash = True +api.req_options.auto_parse_qs_csv = True +api.add_route("/api/v1/ann/{logo_id:int}", ANNResource()) +api.add_route("/api/v1/ann", ANNEmbeddingResource()) diff --git a/gunicorn.py b/gunicorn.py new file mode 100644 index 0000000..a2f2287 --- /dev/null +++ b/gunicorn.py @@ -0,0 +1,3 @@ +bind = ":5501" +workers = 1 +timeout = 60 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..854950c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +annoy==1.16.3 +gunicorn==20.0.4 +falcon==2.0.0 +falcon-cors==1.1.7 +falcon-multipart==0.2.0 +sentry-sdk[falcon]==0.14.4 \ No newline at end of file diff --git a/settings.py b/settings.py new file mode 100644 index 0000000..6e6b7e1 --- /dev/null +++ b/settings.py @@ -0,0 +1,10 @@ +import os +import pathlib + +PROJECT_DIR = pathlib.Path(__file__).parent +DATA_DIR = PROJECT_DIR / "data" + +SENTRY_DSN = os.environ.get("SENTRY_DSN") +INDEX_DIM = 1280 +INDEX_PATH = DATA_DIR / "index.bin" +KEYS_PATH = DATA_DIR / "index.txt" diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..3a2b30b --- /dev/null +++ b/utils.py @@ -0,0 +1,90 @@ +import gzip +import json +import logging +import os +import pathlib + +import sys +from typing import Callable, Union, Iterable, Dict + + +def get_logger(name=None, level: str = "INFO"): + logger = logging.getLogger(name) + logger.setLevel(level) + + if name is None: + configure_root_logger(logger, level) + + return logger + + +def configure_root_logger(logger, level: str = "INFO"): + log_level = os.environ.get("LOG_LEVEL", "INFO").upper() + + if log_level not in ("DEBUG", "INFO", "WARNING", "ERROR", "FATAL", "CRITICAL"): + print( + "Unknown log level: {}, fallback " "to INFO".format(log_level), + file=sys.stderr, + ) + log_level = level + + logger.setLevel(log_level) + handler = logging.StreamHandler() + formatter = logging.Formatter( + "%(asctime)s :: %(processName)s :: " + "%(threadName)s :: %(levelname)s :: " + "%(message)s" + ) + handler.setFormatter(formatter) + handler.setLevel(log_level) + logger.addHandler(handler) + + +def jsonl_iter(jsonl_path: Union[str, pathlib.Path]) -> Iterable[Dict]: + open_fn = get_open_fn(jsonl_path) + + with open_fn(str(jsonl_path), "rt", encoding="utf-8") as f: + yield from jsonl_iter_fp(f) + + +def gzip_jsonl_iter(jsonl_path: Union[str, pathlib.Path]) -> Iterable[Dict]: + with gzip.open(jsonl_path, "rt", encoding="utf-8") as f: + yield from jsonl_iter_fp(f) + + +def jsonl_iter_fp(fp) -> Iterable[Dict]: + for line in fp: + line = line.strip("\n") + if line: + yield json.loads(line) + + +def dump_jsonl(filepath: Union[str, pathlib.Path], json_iter: Iterable[Dict]) -> int: + count = 0 + open_fn = get_open_fn(filepath) + + with open_fn(str(filepath), "wt") as f: + for item in json_iter: + f.write(json.dumps(item) + "\n") + count += 1 + + return count + + +def get_open_fn(filepath: Union[str, pathlib.Path]) -> Callable: + filepath = str(filepath) + if filepath.endswith(".gz"): + return gzip.open + 
else: + return open + + +def text_file_iter(filepath: Union[str, pathlib.Path]) -> Iterable[str]: + open_fn = get_open_fn(filepath) + + with open_fn(str(filepath), "rt") as f: + for item in f: + item = item.strip("\n") + + if item: + yield item From 4671c58b243a6681d0132690d6e52abad1e96009 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sat, 23 May 2020 09:47:14 +0200 Subject: [PATCH 02/30] Add new endpoint to ANN API --- api.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/api.py b/api.py index a437754..48064d0 100644 --- a/api.py +++ b/api.py @@ -1,5 +1,6 @@ +import random import pathlib -from typing import List +from typing import List, Optional import annoy import falcon @@ -32,8 +33,13 @@ def load_keys(file_path: pathlib.Path) -> List[int]: class ANNResource: - def on_get(self, req: falcon.Request, resp: falcon.Response, logo_id: int): - if logo_id not in KEY_TO_ANN_ID: + def on_get( + self, req: falcon.Request, resp: falcon.Response, logo_id: Optional[int] = None + ): + if logo_id is None: + logo_id = KEYS[random.randint(0, len(KEYS) - 1)] + + elif logo_id not in KEY_TO_ANN_ID: resp.status = falcon.HTTP_404 return @@ -99,4 +105,5 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): api.req_options.strip_url_path_trailing_slash = True api.req_options.auto_parse_qs_csv = True api.add_route("/api/v1/ann/{logo_id:int}", ANNResource()) +api.add_route("/api/v1/ann/random", ANNResource()) api.add_route("/api/v1/ann", ANNEmbeddingResource()) From 730993ea7bfacf11e0cd9703f17f26d16dc0ca78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sat, 23 May 2020 10:17:44 +0200 Subject: [PATCH 03/30] Don't remove first element in ann results --- api.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/api.py b/api.py index 48064d0..2de785d 100644 --- a/api.py +++ b/api.py @@ -47,14 +47,9 @@ def on_get( item_index = KEY_TO_ANN_ID[logo_id] indexes, distances = INDEX.get_nns_by_item( - item_index, count + 1, include_distances=True + item_index, count, include_distances=True ) - if indexes.pop(0) != item_index: - logger.warning("Most similar logo is not provided logo") - - distances.pop(0) - logo_ids = [KEYS[index] for index in indexes] results = [] From 736585239c98ce497115bafeb91393c1b5ed3353 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Wed, 27 May 2020 12:54:38 +0200 Subject: [PATCH 04/30] Allow ANN service to load multiple indexes --- api.py | 51 ++++++++++++++++++++++++++++++++++++++++----------- settings.py | 7 +++++-- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/api.py b/api.py index 2de785d..b13bec8 100644 --- a/api.py +++ b/api.py @@ -1,6 +1,6 @@ import random import pathlib -from typing import List, Optional +from typing import Dict, List, Optional import annoy import falcon @@ -27,30 +27,52 @@ def load_keys(file_path: pathlib.Path) -> List[int]: return [int(x) for x in text_file_iter(file_path)] -INDEX = load_index(settings.INDEX_PATH) -KEYS = load_keys(settings.KEYS_PATH) -KEY_TO_ANN_ID = {x: i for i, x in enumerate(KEYS)} +class ANNIndex: + def __init__(self, index: annoy.AnnoyIndex, keys: List[int]): + self.index = index + self.keys = keys + self.key_to_ann_id = {x: i for i, x in enumerate(self.keys)} + + @classmethod + def load(cls, index_dir: pathlib.Path) -> "ANNIndex": + index = load_index(index_dir / settings.INDEX_FILE_NAME) + keys = load_keys(index_dir / settings.KEYS_FILE_NAME) + return cls(index, keys) + + 
+INDEXES: Dict[str, ANNIndex] = { + index_dir.name: ANNIndex.load(index_dir) + for index_dir in settings.DATA_DIR.iterdir() + if index_dir.is_dir() +} class ANNResource: def on_get( self, req: falcon.Request, resp: falcon.Response, logo_id: Optional[int] = None ): + index_name = req.get_param("index", default=settings.DEFAULT_INDEX) + + if index_name not in INDEXES: + raise falcon.HTTPBadRequest("unknown index: {}".format(index_name)) + + ann_index = INDEXES[index_name] + if logo_id is None: - logo_id = KEYS[random.randint(0, len(KEYS) - 1)] + logo_id = ann_index.keys[random.randint(0, len(ann_index.keys) - 1)] - elif logo_id not in KEY_TO_ANN_ID: + elif logo_id not in ann_index.key_to_ann_id: resp.status = falcon.HTTP_404 return count = req.get_param_as_int("count", min_value=1, max_value=500, default=100) - item_index = KEY_TO_ANN_ID[logo_id] + item_index = ann_index.key_to_ann_id[logo_id] - indexes, distances = INDEX.get_nns_by_item( + indexes, distances = ann_index.index.get_nns_by_item( item_index, count, include_distances=True ) - logo_ids = [KEYS[index] for index in indexes] + logo_ids = [ann_index.keys[index] for index in indexes] results = [] for ann_logo_id, distance in zip(logo_ids, distances): @@ -61,6 +83,13 @@ def on_get( class ANNEmbeddingResource: def on_post(self, req: falcon.Request, resp: falcon.Response): + index_name = req.get_param("index", default=settings.DEFAULT_INDEX) + + if index_name not in INDEXES: + raise falcon.HTTPBadRequest("unknown index: {}".format(index_name)) + + ann_index = INDEXES[index_name] + count = req.media.get("count", 1) embedding = req.media["embedding"] @@ -72,11 +101,11 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): ), ) - indexes, distances = INDEX.get_nns_by_vector( + indexes, distances = ann_index.index.get_nns_by_vector( embedding, count, include_distances=True ) - logo_ids = [KEYS[index] for index in indexes] + logo_ids = [ann_index.keys[index] for index in indexes] results = [] for ann_logo_id, distance in zip(logo_ids, distances): diff --git a/settings.py b/settings.py index 6e6b7e1..7cbe20a 100644 --- a/settings.py +++ b/settings.py @@ -6,5 +6,8 @@ SENTRY_DSN = os.environ.get("SENTRY_DSN") INDEX_DIM = 1280 -INDEX_PATH = DATA_DIR / "index.bin" -KEYS_PATH = DATA_DIR / "index.txt" + +INDEX_FILE_NAME = "index.bin" +KEYS_FILE_NAME = "index.txt" + +DEFAULT_INDEX = "efficientnet-b0" From 70dd5d8f166cc292d53bc8481fa74b006268f426 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Thu, 28 May 2020 10:18:01 +0200 Subject: [PATCH 05/30] Allow for various index dimension in ANN --- api.py | 7 ++++--- settings.py | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/api.py b/api.py index b13bec8..4e6252d 100644 --- a/api.py +++ b/api.py @@ -17,8 +17,8 @@ sentry_sdk.init(dsn=settings.SENTRY_DSN, integrations=[FalconIntegration()]) -def load_index(file_path: pathlib.Path) -> annoy.AnnoyIndex: - index = annoy.AnnoyIndex(settings.INDEX_DIM, "euclidean") +def load_index(file_path: pathlib.Path, dimension: int) -> annoy.AnnoyIndex: + index = annoy.AnnoyIndex(dimension, "euclidean") index.load(str(file_path), prefault=True) return index @@ -35,7 +35,8 @@ def __init__(self, index: annoy.AnnoyIndex, keys: List[int]): @classmethod def load(cls, index_dir: pathlib.Path) -> "ANNIndex": - index = load_index(index_dir / settings.INDEX_FILE_NAME) + dimension = settings.INDEX_DIM[index_dir.name] + index = load_index(index_dir / settings.INDEX_FILE_NAME, dimension) keys = load_keys(index_dir / 
settings.KEYS_FILE_NAME) return cls(index, keys) diff --git a/settings.py b/settings.py index 7cbe20a..e46c332 100644 --- a/settings.py +++ b/settings.py @@ -1,11 +1,12 @@ import os import pathlib +from typing import Dict PROJECT_DIR = pathlib.Path(__file__).parent DATA_DIR = PROJECT_DIR / "data" SENTRY_DSN = os.environ.get("SENTRY_DSN") -INDEX_DIM = 1280 +INDEX_DIM: Dict[str, int] = {"efficientnet-b0": 1280, "efficientnet-b5": 2048} INDEX_FILE_NAME = "index.bin" KEYS_FILE_NAME = "index.txt" From 9dad740d874a16552f9fd5a6001bfa5139127c12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 9 Jun 2020 17:21:57 +0200 Subject: [PATCH 06/30] Add endpoint to save embeddings on disk --- api.py | 52 ++++++++++++--- embeddings.py | 169 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 8 ++- schema.py | 27 ++++++++ settings.py | 4 ++ utils.py | 43 +++++++++++- 6 files changed, 291 insertions(+), 12 deletions(-) create mode 100644 embeddings.py create mode 100644 schema.py diff --git a/api.py b/api.py index 4e6252d..6732c2b 100644 --- a/api.py +++ b/api.py @@ -4,12 +4,16 @@ import annoy import falcon +from falcon.media.validators import jsonschema from falcon_cors import CORS from falcon_multipart.middleware import MultipartMiddleware +import numpy as np import sentry_sdk from sentry_sdk.integrations.falcon import FalconIntegration -from utils import get_logger, text_file_iter +from embeddings import add_logos, get_embedding +from utils import get_image_from_url, get_logger, text_file_iter +import schema import settings logger = get_logger() @@ -29,7 +33,7 @@ def load_keys(file_path: pathlib.Path) -> List[int]: class ANNIndex: def __init__(self, index: annoy.AnnoyIndex, keys: List[int]): - self.index = index + self.index: annoy.AnnoyIndex = index self.keys = keys self.key_to_ann_id = {x: i for i, x in enumerate(self.keys)} @@ -53,6 +57,7 @@ def on_get( self, req: falcon.Request, resp: falcon.Response, logo_id: Optional[int] = None ): index_name = req.get_param("index", default=settings.DEFAULT_INDEX) + count = req.get_param_as_int("count", min_value=1, max_value=500, default=100) if index_name not in INDEXES: raise falcon.HTTPBadRequest("unknown index: {}".format(index_name)) @@ -63,15 +68,21 @@ def on_get( logo_id = ann_index.keys[random.randint(0, len(ann_index.keys) - 1)] elif logo_id not in ann_index.key_to_ann_id: - resp.status = falcon.HTTP_404 - return + embedding = get_embedding(logo_id) - count = req.get_param_as_int("count", min_value=1, max_value=500, default=100) - item_index = ann_index.key_to_ann_id[logo_id] + if embedding is None: + resp.status = falcon.HTTP_404 + return - indexes, distances = ann_index.index.get_nns_by_item( - item_index, count, include_distances=True - ) + indexes, distances = ann_index.index.get_nns_by_vector( + embedding, count, include_distance=True + ) + + else: + item_index = ann_index.key_to_ann_id[logo_id] + indexes, distances = ann_index.index.get_nns_by_item( + item_index, count, include_distances=True + ) logo_ids = [ann_index.keys[index] for index in indexes] results = [] @@ -115,6 +126,28 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): resp.media = {"results": results, "count": len(results)} +class AddLogoResource: + @jsonschema.validate(schema.ADD_LOGO_SCHEMA) + def on_post(self, req: falcon.Request, resp: falcon.Response): + image_url = req.media["image_url"] + logos = req.media["logos"] + logo_ids = [logo["id"] for logo in logos] + bounding_boxes = [logo["bounding_box"] for logo in logos] 
+ + image = get_image_from_url(image_url) + + if image is None: + raise falcon.HTTPBadRequest("invalid image") + + if np.array(image).shape[-1] != 3: + image = image.convert("RGB") + + added = add_logos(image, logo_ids, bounding_boxes) + resp.media = { + "added": added, + } + + cors = CORS( allow_all_origins=True, allow_all_headers=True, @@ -132,3 +165,4 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): api.add_route("/api/v1/ann/{logo_id:int}", ANNResource()) api.add_route("/api/v1/ann/random", ANNResource()) api.add_route("/api/v1/ann", ANNEmbeddingResource()) +api.add_route("/api/v1/ann/add", AddLogoResource()) diff --git a/embeddings.py b/embeddings.py new file mode 100644 index 0000000..b7c1d1d --- /dev/null +++ b/embeddings.py @@ -0,0 +1,169 @@ +import pathlib +from typing import Any, Dict, List, Optional, Tuple + +import h5py +import numpy as np +from PIL import Image + +from efficientnet_pytorch import EfficientNet +import torch + +import settings + + +class EmbeddingStore: + def __init__(self, hdf5_path: pathlib.Path): + self.hdf5_path = hdf5_path + self.logo_id_to_idx: Dict[int, int] = self.load() + self.offset = ( + max(self.logo_id_to_idx.values()) + 1 if self.logo_id_to_idx else 0 + ) + + def __contains__(self, logo_id: int) -> bool: + return self.get_index(logo_id) is not None + + def get_index(self, logo_id: int) -> Optional[int]: + return self.logo_id_to_idx.get(logo_id) + + def get_embedding(self, logo_id: int) -> Optional[np.ndarray]: + idx = self.get_index(logo_id) + + if idx is None: + return None + + if self.hdf5_path.is_file(): + with h5py.File(self.hdf5_path, "r") as f: + embedding_dset = f["embedding"] + return embedding_dset[idx] + + return None + + def load(self): + if self.hdf5_path.is_file(): + with h5py.File(self.hdf5_path, "r") as f: + external_id_dset = f["external_id"] + array = external_id_dset[:] + non_zero_indexes = np.flatnonzero(array) + array = array[: non_zero_indexes[-1] + 1] + return {int(x): i for i, x in enumerate(array)} + + return {} + + def save_embeddings( + self, embeddings: np.ndarray, external_ids: np.ndarray, + ): + file_exists = self.hdf5_path.is_file() + + with h5py.File(self.hdf5_path, "a") as f: + if not file_exists: + embedding_dset = f.create_dataset( + "embedding", + (settings.DEFAULT_HDF5_COUNT, embeddings.shape[-1]), + dtype="f", + chunks=True, + ) + external_id_dset = f.create_dataset( + "external_id", + (settings.DEFAULT_HDF5_COUNT,), + dtype="i", + chunks=True, + ) + else: + embedding_dset = f["embedding"] + external_id_dset = f["external_id"] + + slicing = slice(self.offset, self.offset + len(embeddings)) + embedding_dset[slicing] = embeddings + external_id_dset[slicing] = external_ids + + for external_id, idx in zip( + external_ids, range(self.offset, self.offset + len(embeddings)) + ): + self.logo_id_to_idx[external_id] = idx + + self.offset += len(embeddings) + + +EMBEDDING_STORE = EmbeddingStore(settings.EMBEDDINGS_HDF5_PATH) + + +def build_model(model_type: str): + return EfficientNet.from_pretrained(model_type) + + +def generate_embeddings(model, images: np.ndarray, device: torch.device) -> np.ndarray: + images = np.moveaxis(images, -1, 1) # move channel dim to 1st dim + + with torch.no_grad(): + torch_images = torch.tensor(images, dtype=torch.float32, device=device) + embeddings = model.extract_features(torch_images).cpu().numpy() + + return np.max(embeddings, (-1, -2)) + + +def crop_image( + image: Image.Image, bounding_box: Tuple[float, float, float, float] +) -> Image.Image: + y_min, x_min, y_max, 
x_max = bounding_box + (left, right, top, bottom) = ( + x_min * image.width, + x_max * image.width, + y_min * image.height, + y_max * image.height, + ) + return image.crop((left, top, right, bottom)) + + +def get_embedding(logo_id: int) -> Optional[np.ndarray]: + return EMBEDDING_STORE.get_embedding(logo_id) + + +def add_logos( + image: Image.Image, + external_ids: List[int], + bounding_boxes: List[Tuple[float, float, float, float]], + device: Optional[torch.device] = None, +) -> int: + if device is None: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + model = ModelStore.get(settings.DEFAULT_MODEL, device) + image_dim = settings.IMAGE_INPUT_DIM[settings.DEFAULT_MODEL] + + selected_external_ids = [] + selected_bounding_boxes = [] + + for (bounding_box, external_id) in zip(bounding_boxes, external_ids): + if external_id in EMBEDDING_STORE: + continue + + selected_external_ids.append(external_id) + selected_bounding_boxes.append(bounding_box) + + if not selected_bounding_boxes: + return 0 + + images = np.zeros((len(selected_bounding_boxes), image_dim, image_dim, 3)) + for i, bounding_box in enumerate(selected_bounding_boxes): + cropped_image = crop_image(image, bounding_box) + cropped_image = cropped_image.resize((image_dim, image_dim)) + images[i] = np.array(cropped_image) + + embeddings = generate_embeddings(model, images, device) + EMBEDDING_STORE.save_embeddings( + embeddings, np.array(selected_external_ids, dtype="i") + ) + return len(embeddings) + + +class ModelStore: + store: Dict[str, Any] = {} + + @classmethod + def get(cls, model_name: str, device: torch.device): + if model_name not in cls.store: + model = build_model(model_name) + model = model.to(device) + cls.store[model_name] = model + + return cls.store[model_name] diff --git a/requirements.txt b/requirements.txt index 854950c..d199dae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,10 @@ gunicorn==20.0.4 falcon==2.0.0 falcon-cors==1.1.7 falcon-multipart==0.2.0 -sentry-sdk[falcon]==0.14.4 \ No newline at end of file +sentry-sdk[falcon]==0.14.4 +efficientnet_pytorch==0.6.3 +torch==1.5.0 +h5py==2.10.0 +Pillow==7.1.2 +requests==2.23.0 +jsonschema==3.2.0 \ No newline at end of file diff --git a/schema.py b/schema.py new file mode 100644 index 0000000..6d4d8b4 --- /dev/null +++ b/schema.py @@ -0,0 +1,27 @@ +from typing import Any, Dict + +ADD_LOGO_SCHEMA: Dict[str, Any] = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Add Logo", + "type": "object", + "properties": { + "logos": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": {"type": "integer"}, + "bounding_box": { + "type": "array", + "minItems": 4, + "maxItems": 4, + "items": {"type": "number"}, + }, + }, + "required": ["id", "bounding_box"], + }, + }, + "image_url": {"type": "string", "format": "uri"}, + }, + "required": ["image_url", "logos"], +} diff --git a/settings.py b/settings.py index e46c332..cd62853 100644 --- a/settings.py +++ b/settings.py @@ -7,8 +7,12 @@ SENTRY_DSN = os.environ.get("SENTRY_DSN") INDEX_DIM: Dict[str, int] = {"efficientnet-b0": 1280, "efficientnet-b5": 2048} +IMAGE_INPUT_DIM: Dict[str, int] = {"efficientnet-b0": 224} INDEX_FILE_NAME = "index.bin" KEYS_FILE_NAME = "index.txt" DEFAULT_INDEX = "efficientnet-b0" +DEFAULT_MODEL = "efficientnet-b0" +DEFAULT_HDF5_COUNT = 10000000 +EMBEDDINGS_HDF5_PATH = DATA_DIR / "efficientnet-b0.hdf5" diff --git a/utils.py b/utils.py index 3a2b30b..f7617b2 100644 --- a/utils.py +++ b/utils.py @@ -3,9 +3,12 @@ import logging 
import os import pathlib - import sys -from typing import Callable, Union, Iterable, Dict +import tempfile +from typing import Callable, Dict, Optional, Union, Iterable, Tuple + +from PIL import Image +import requests def get_logger(name=None, level: str = "INFO"): @@ -88,3 +91,39 @@ def text_file_iter(filepath: Union[str, pathlib.Path]) -> Iterable[str]: if item: yield item + + +def crop_image( + image: Image.Image, bounding_box: Tuple[float, float, float, float] +) -> Image.Image: + y_min, x_min, y_max, x_max = bounding_box + (left, right, top, bottom) = ( + x_min * image.width, + x_max * image.width, + y_min * image.height, + y_max * image.height, + ) + return image.crop((left, top, right, bottom)) + + +def get_image_from_url( + image_url: str, + error_raise: bool = False, + session: Optional[requests.Session] = None, +) -> Optional[Image.Image]: + if session: + r = session.get(image_url) + else: + r = requests.get(image_url) + + if error_raise: + r.raise_for_status() + + if r.status_code != 200: + return None + + with tempfile.NamedTemporaryFile() as f: + f.write(r.content) + image = Image.open(f.name) + + return image From 307b3169e6e517db0f10e5ed26cc56b0dc9b186d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Wed, 10 Jun 2020 17:08:30 +0200 Subject: [PATCH 07/30] Improve performance of logo addition call --- api.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/api.py b/api.py index 6732c2b..60b2eab 100644 --- a/api.py +++ b/api.py @@ -11,7 +11,7 @@ import sentry_sdk from sentry_sdk.integrations.falcon import FalconIntegration -from embeddings import add_logos, get_embedding +from embeddings import add_logos, get_embedding, EMBEDDING_STORE from utils import get_image_from_url, get_logger, text_file_iter import schema import settings @@ -132,6 +132,13 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): image_url = req.media["image_url"] logos = req.media["logos"] logo_ids = [logo["id"] for logo in logos] + + if all(logo_id in EMBEDDING_STORE for logo_id in logo_ids): + resp.media = { + "added": 0, + } + return + bounding_boxes = [logo["bounding_box"] for logo in logos] image = get_image_from_url(image_url) From a01abbedc599bd6e0d8262e37cccce0df0dd5472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Wed, 10 Jun 2020 17:09:08 +0200 Subject: [PATCH 08/30] Sort imports --- api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api.py b/api.py index 60b2eab..4bbc5e1 100644 --- a/api.py +++ b/api.py @@ -1,5 +1,5 @@ -import random import pathlib +import random from typing import Dict, List, Optional import annoy From e819af4f3e33e2b846ce987f2161176293df6682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Wed, 10 Jun 2020 17:10:18 +0200 Subject: [PATCH 09/30] Fix bug in /ann endpoint --- api.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/api.py b/api.py index 4bbc5e1..94317e3 100644 --- a/api.py +++ b/api.py @@ -64,10 +64,15 @@ def on_get( ann_index = INDEXES[index_name] - if logo_id is None: - logo_id = ann_index.keys[random.randint(0, len(ann_index.keys) - 1)] + if logo_id is None or logo_id in ann_index.key_to_ann_id: + if logo_id is None: + logo_id = ann_index.keys[random.randint(0, len(ann_index.keys) - 1)] - elif logo_id not in ann_index.key_to_ann_id: + item_index = ann_index.key_to_ann_id[logo_id] + indexes, distances = ann_index.index.get_nns_by_item( + item_index, count, 
include_distances=True + ) + else: embedding = get_embedding(logo_id) if embedding is None: @@ -78,12 +83,6 @@ def on_get( embedding, count, include_distance=True ) - else: - item_index = ann_index.key_to_ann_id[logo_id] - indexes, distances = ann_index.index.get_nns_by_item( - item_index, count, include_distances=True - ) - logo_ids = [ann_index.keys[index] for index in indexes] results = [] @@ -170,6 +169,6 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): api.req_options.strip_url_path_trailing_slash = True api.req_options.auto_parse_qs_csv = True api.add_route("/api/v1/ann/{logo_id:int}", ANNResource()) -api.add_route("/api/v1/ann/random", ANNResource()) -api.add_route("/api/v1/ann", ANNEmbeddingResource()) +api.add_route("/api/v1/ann", ANNResource()) +api.add_route("/api/v1/ann/from_embedding", ANNEmbeddingResource()) api.add_route("/api/v1/ann/add", AddLogoResource()) From cd98c73136cbbdf72a21d9efbaea885f8abc4cf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Thu, 11 Jun 2020 15:22:36 +0200 Subject: [PATCH 10/30] Add batch ANN endpoint --- api.py | 80 ++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 22 deletions(-) diff --git a/api.py b/api.py index 94317e3..f168b61 100644 --- a/api.py +++ b/api.py @@ -1,6 +1,6 @@ import pathlib import random -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional import annoy import falcon @@ -11,7 +11,7 @@ import sentry_sdk from sentry_sdk.integrations.falcon import FalconIntegration -from embeddings import add_logos, get_embedding, EMBEDDING_STORE +from embeddings import add_logos, EMBEDDING_STORE, get_embedding from utils import get_image_from_url, get_logger, text_file_iter import schema import settings @@ -34,7 +34,7 @@ def load_keys(file_path: pathlib.Path) -> List[int]: class ANNIndex: def __init__(self, index: annoy.AnnoyIndex, keys: List[int]): self.index: annoy.AnnoyIndex = index - self.keys = keys + self.keys: List[int] = keys self.key_to_ann_id = {x: i for i, x in enumerate(self.keys)} @classmethod @@ -64,32 +64,67 @@ def on_get( ann_index = INDEXES[index_name] - if logo_id is None or logo_id in ann_index.key_to_ann_id: - if logo_id is None: - logo_id = ann_index.keys[random.randint(0, len(ann_index.keys) - 1)] + if logo_id is None: + logo_id = ann_index.keys[random.randint(0, len(ann_index.keys) - 1)] - item_index = ann_index.key_to_ann_id[logo_id] - indexes, distances = ann_index.index.get_nns_by_item( - item_index, count, include_distances=True - ) + results = get_nearest_neighbors(ann_index, count, logo_id) + + if results is None: + resp.status = falcon.HTTP_404 else: - embedding = get_embedding(logo_id) + resp.media = {"results": results, "count": len(results)} - if embedding is None: - resp.status = falcon.HTTP_404 - return - indexes, distances = ann_index.index.get_nns_by_vector( - embedding, count, include_distance=True - ) +class ANNBatchResource: + def on_get(self, req: falcon.Request, resp: falcon.Response): + index_name = req.get_param("index", default=settings.DEFAULT_INDEX) + count = req.get_param_as_int("count", min_value=1, max_value=500, default=100) + logo_ids = req.get_param_as_list( + "logo_ids", required=True, transform=int, default=[] + ) + if index_name not in INDEXES: + raise falcon.HTTPBadRequest("unknown index: {}".format(index_name)) - logo_ids = [ann_index.keys[index] for index in indexes] - results = [] + ann_index = INDEXES[index_name] + results = {} - for ann_logo_id, distance in 
zip(logo_ids, distances): - results.append({"distance": distance, "logo_id": ann_logo_id}) + for logo_id in logo_ids: + logo_results = get_nearest_neighbors(ann_index, count, logo_id) - resp.media = {"results": results, "count": len(results)} + if logo_results is not None: + results[logo_id] = logo_results + + resp.media = { + "results": results, + "count": len(results), + } + + +def get_nearest_neighbors( + ann_index: ANNIndex, count: int, logo_id: int +) -> Optional[List[Dict[str, Any]]]: + if logo_id in ann_index.key_to_ann_id: + item_index = ann_index.key_to_ann_id[logo_id] + indexes, distances = ann_index.index.get_nns_by_item( + item_index, count, include_distances=True + ) + else: + embedding = get_embedding(logo_id) + + if embedding is None: + return None + + indexes, distances = ann_index.index.get_nns_by_vector( + embedding, count, include_distance=True + ) + + logo_ids = [ann_index.keys[index] for index in indexes] + results = [] + + for ann_logo_id, distance in zip(logo_ids, distances): + results.append({"distance": distance, "logo_id": ann_logo_id}) + + return results class ANNEmbeddingResource: @@ -170,5 +205,6 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): api.req_options.auto_parse_qs_csv = True api.add_route("/api/v1/ann/{logo_id:int}", ANNResource()) api.add_route("/api/v1/ann", ANNResource()) +api.add_route("/api/v1/ann/batch", ANNBatchResource()) api.add_route("/api/v1/ann/from_embedding", ANNEmbeddingResource()) api.add_route("/api/v1/ann/add", AddLogoResource()) From aa7fc4fc08d56141d150c86c0a981f7d8a536fa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Thu, 11 Jun 2020 16:18:00 +0200 Subject: [PATCH 11/30] Fix get_logger level setting --- utils.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/utils.py b/utils.py index f7617b2..d351019 100644 --- a/utils.py +++ b/utils.py @@ -5,14 +5,26 @@ import pathlib import sys import tempfile -from typing import Callable, Dict, Optional, Union, Iterable, Tuple +from typing import Callable, Dict, Iterable, Optional, Tuple, Union from PIL import Image import requests -def get_logger(name=None, level: str = "INFO"): +def get_logger(name=None, level: Optional[int] = None): logger = logging.getLogger(name) + + if level is None: + log_level = os.environ.get("LOG_LEVEL", "INFO").upper() + level = logging.getLevelName(log_level) + + if not isinstance(level, int): + print( + "Unknown log level: {}, fallback to INFO".format(log_level), + file=sys.stderr, + ) + level = 20 + logger.setLevel(level) if name is None: @@ -21,17 +33,8 @@ def get_logger(name=None, level: str = "INFO"): return logger -def configure_root_logger(logger, level: str = "INFO"): - log_level = os.environ.get("LOG_LEVEL", "INFO").upper() - - if log_level not in ("DEBUG", "INFO", "WARNING", "ERROR", "FATAL", "CRITICAL"): - print( - "Unknown log level: {}, fallback " "to INFO".format(log_level), - file=sys.stderr, - ) - log_level = level - - logger.setLevel(log_level) +def configure_root_logger(logger, level: int = 20): + logger.setLevel(level) handler = logging.StreamHandler() formatter = logging.Formatter( "%(asctime)s :: %(processName)s :: " @@ -39,7 +42,7 @@ def configure_root_logger(logger, level: str = "INFO"): "%(message)s" ) handler.setFormatter(formatter) - handler.setLevel(log_level) + handler.setLevel(level) logger.addHandler(handler) From 998922761eccbc87e1786eca6f645596f9eb7630 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Fri, 
12 Jun 2020 11:27:04 +0200 Subject: [PATCH 12/30] Fix bug in batch ANN endpoint --- api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api.py b/api.py index f168b61..4eaf5b7 100644 --- a/api.py +++ b/api.py @@ -115,7 +115,7 @@ def get_nearest_neighbors( return None indexes, distances = ann_index.index.get_nns_by_vector( - embedding, count, include_distance=True + embedding, count, include_distances=True ) logo_ids = [ann_index.keys[index] for index in indexes] From f570ce1961a6876ca02aacca7866d34ec6bb3553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Fri, 12 Jun 2020 22:09:09 +0200 Subject: [PATCH 13/30] Add logging messages --- api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/api.py b/api.py index 4eaf5b7..81cc853 100644 --- a/api.py +++ b/api.py @@ -45,11 +45,13 @@ def load(cls, index_dir: pathlib.Path) -> "ANNIndex": return cls(index, keys) +logger.info("Loading ANN indexes...") INDEXES: Dict[str, ANNIndex] = { index_dir.name: ANNIndex.load(index_dir) for index_dir in settings.DATA_DIR.iterdir() if index_dir.is_dir() } +logger.info("Index loaded") class ANNResource: From d004250a60326b01f4c72a9220075f1987fec2c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 23 Jun 2020 19:32:36 +0100 Subject: [PATCH 14/30] Add new endpoint to give ANN count --- api.py | 6 ++++++ embeddings.py | 3 +++ 2 files changed, 9 insertions(+) diff --git a/api.py b/api.py index 81cc853..2e107b3 100644 --- a/api.py +++ b/api.py @@ -191,6 +191,11 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): } +class ANNCountResource: + def on_get(self, req: falcon.Request, resp: falcon.Response): + req.media = {"count": len(EMBEDDING_STORE)} + + cors = CORS( allow_all_origins=True, allow_all_headers=True, @@ -210,3 +215,4 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): api.add_route("/api/v1/ann/batch", ANNBatchResource()) api.add_route("/api/v1/ann/from_embedding", ANNEmbeddingResource()) api.add_route("/api/v1/ann/add", AddLogoResource()) +api.add_route("/api/v1/ann/count", ANNCountResource()) diff --git a/embeddings.py b/embeddings.py index b7c1d1d..80a0050 100644 --- a/embeddings.py +++ b/embeddings.py @@ -19,6 +19,9 @@ def __init__(self, hdf5_path: pathlib.Path): max(self.logo_id_to_idx.values()) + 1 if self.logo_id_to_idx else 0 ) + def __len__(self): + return len(self.logo_id_to_idx) + def __contains__(self, logo_id: int) -> bool: return self.get_index(logo_id) is not None From a1f22434e1054c7fc30ecee56b2decd5f70c0d6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Mon, 29 Jun 2020 10:50:12 +0200 Subject: [PATCH 15/30] Add new endpoint to ANN API to get stored logo IDs --- api.py | 6 ++++++ embeddings.py | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/api.py b/api.py index 2e107b3..627cd88 100644 --- a/api.py +++ b/api.py @@ -196,6 +196,11 @@ def on_get(self, req: falcon.Request, resp: falcon.Response): req.media = {"count": len(EMBEDDING_STORE)} +class ANNStoredLogoResource: + def on_get(self, req: falcon.Request, resp: falcon.Response): + req.media = {"stored": list(EMBEDDING_STORE.get_logo_ids())} + + cors = CORS( allow_all_origins=True, allow_all_headers=True, @@ -216,3 +221,4 @@ def on_get(self, req: falcon.Request, resp: falcon.Response): api.add_route("/api/v1/ann/from_embedding", ANNEmbeddingResource()) api.add_route("/api/v1/ann/add", AddLogoResource()) api.add_route("/api/v1/ann/count", ANNCountResource()) 
+api.add_route("/api/v1/ann/stored", ANNStoredLogoResource()) diff --git a/embeddings.py b/embeddings.py index 80a0050..20d9aa5 100644 --- a/embeddings.py +++ b/embeddings.py @@ -1,5 +1,5 @@ import pathlib -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Optional, Tuple import h5py import numpy as np @@ -25,6 +25,9 @@ def __len__(self): def __contains__(self, logo_id: int) -> bool: return self.get_index(logo_id) is not None + def get_logo_ids(self) -> Iterable[int]: + return self.logo_id_to_idx.keys() + def get_index(self, logo_id: int) -> Optional[int]: return self.logo_id_to_idx.get(logo_id) From da94d2a1bb21532bc998ff6417cb29ef3e4792cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Mon, 29 Jun 2020 11:19:40 +0200 Subject: [PATCH 16/30] Fix bug in ANN API --- api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api.py b/api.py index 627cd88..8b7d7da 100644 --- a/api.py +++ b/api.py @@ -193,12 +193,12 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): class ANNCountResource: def on_get(self, req: falcon.Request, resp: falcon.Response): - req.media = {"count": len(EMBEDDING_STORE)} + resp.media = {"count": len(EMBEDDING_STORE)} class ANNStoredLogoResource: def on_get(self, req: falcon.Request, resp: falcon.Response): - req.media = {"stored": list(EMBEDDING_STORE.get_logo_ids())} + resp.media = {"stored": list(EMBEDDING_STORE.get_logo_ids())} cors = CORS( From 429145c4ab3bfaaf1fc566e4a3a72c186f2d6431 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Mon, 29 Jun 2020 14:19:36 +0200 Subject: [PATCH 17/30] Fix serialization bug --- embeddings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/embeddings.py b/embeddings.py index 20d9aa5..1cf42a6 100644 --- a/embeddings.py +++ b/embeddings.py @@ -85,7 +85,7 @@ def save_embeddings( for external_id, idx in zip( external_ids, range(self.offset, self.offset + len(embeddings)) ): - self.logo_id_to_idx[external_id] = idx + self.logo_id_to_idx[int(external_id)] = idx self.offset += len(embeddings) From 94e700d2d32b6b17fd882be81d00950e34ff0a48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Mon, 29 Jun 2020 14:21:09 +0200 Subject: [PATCH 18/30] [ann] Add script to generate index --- embeddings.py | 16 ++++++++++++++++ manage.py | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 manage.py diff --git a/embeddings.py b/embeddings.py index 1cf42a6..ac56bd8 100644 --- a/embeddings.py +++ b/embeddings.py @@ -1,3 +1,4 @@ +import operator import pathlib from typing import Any, Dict, Iterable, List, Optional, Tuple @@ -55,6 +56,21 @@ def load(self): return {} + def iter_embeddings(self) -> Iterable[Tuple[int, np.ndarray]]: + if not self.hdf5_path.is_file(): + return + + idx_logo_id = sorted( + ((idx, logo_id) for logo_id, idx in self.logo_id_to_idx.items()), + key=operator.itemgetter(0), + ) + + with h5py.File(self.hdf5_path, "r") as f: + embedding_dset = f["embedding"] + for idx, logo_id in idx_logo_id: + embedding = embedding_dset[idx] + yield logo_id, embedding + def save_embeddings( self, embeddings: np.ndarray, external_ids: np.ndarray, ): diff --git a/manage.py b/manage.py new file mode 100644 index 0000000..caa4ee6 --- /dev/null +++ b/manage.py @@ -0,0 +1,39 @@ +if __name__ == "__main__": + import pathlib + + import click + + @click.group() + def cli(): + pass + + @click.command() + @click.argument("output", 
type=pathlib.Path) + @click.option("--tree-count", type=int, default=100) + def generate_index(output: pathlib.Path, tree_count: int): + from annoy import AnnoyIndex + from embeddings import EMBEDDING_STORE + + index = None + offset: int = 0 + keys = [] + + for logo_id, embedding in EMBEDDING_STORE.iter_embeddings(): + if index is None: + output_dim = embedding.shape[-1] + index = AnnoyIndex(output_dim, "euclidean") + + index.add_item(offset, embedding) + keys.append(int(logo_id)) + offset += 1 + + if index is not None: + index.build(tree_count) + index.save(str(output)) + + with output.with_suffix(".txt").open("w") as f: + for key in keys: + f.write(str(key) + "\n") + + cli.add_command(generate_index) + cli() From dacf1c7113305c2b305362ee0b032aa7d0a637dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Mon, 29 Jun 2020 17:59:14 +0200 Subject: [PATCH 19/30] [ann] Improve index generation script --- manage.py | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/manage.py b/manage.py index caa4ee6..fa0ba1d 100644 --- a/manage.py +++ b/manage.py @@ -11,29 +11,38 @@ def cli(): @click.argument("output", type=pathlib.Path) @click.option("--tree-count", type=int, default=100) def generate_index(output: pathlib.Path, tree_count: int): + import shutil + import tempfile + from annoy import AnnoyIndex - from embeddings import EMBEDDING_STORE + from embeddings import EmbeddingStore + import settings + + with tempfile.TemporaryDirectory() as tmp_dir: + embedding_path = pathlib.Path(tmp_dir) / "embeddings.hdf5" + shutil.copy(str(settings.EMBEDDINGS_HDF5_PATH), str(embedding_path)) + embedding_store = EmbeddingStore(embedding_path) - index = None - offset: int = 0 - keys = [] + index = None + offset: int = 0 + keys = [] - for logo_id, embedding in EMBEDDING_STORE.iter_embeddings(): - if index is None: - output_dim = embedding.shape[-1] - index = AnnoyIndex(output_dim, "euclidean") + for logo_id, embedding in embedding_store.iter_embeddings(): + if index is None: + output_dim = embedding.shape[-1] + index = AnnoyIndex(output_dim, "euclidean") - index.add_item(offset, embedding) - keys.append(int(logo_id)) - offset += 1 + index.add_item(offset, embedding) + keys.append(int(logo_id)) + offset += 1 - if index is not None: - index.build(tree_count) - index.save(str(output)) + if index is not None: + index.build(tree_count) + index.save(str(output)) - with output.with_suffix(".txt").open("w") as f: - for key in keys: - f.write(str(key) + "\n") + with output.with_suffix(".txt").open("w") as f: + for key in keys: + f.write(str(key) + "\n") - cli.add_command(generate_index) - cli() + cli.add_command(generate_index) + cli() From 84064bc5de86ff978266625eec4d0293670398c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 30 Jun 2020 12:10:55 +0200 Subject: [PATCH 20/30] Improve index generation script --- manage.py | 21 ++++++++++++++++++--- requirements.txt | 4 +++- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/manage.py b/manage.py index fa0ba1d..291a7f3 100644 --- a/manage.py +++ b/manage.py @@ -15,19 +15,28 @@ def generate_index(output: pathlib.Path, tree_count: int): import tempfile from annoy import AnnoyIndex + import tqdm + from embeddings import EmbeddingStore import settings + from utils import get_logger + + logger = get_logger() with tempfile.TemporaryDirectory() as tmp_dir: embedding_path = pathlib.Path(tmp_dir) / "embeddings.hdf5" + logger.info(f"Copying embedding 
file to {embedding_path}...") shutil.copy(str(settings.EMBEDDINGS_HDF5_PATH), str(embedding_path)) + + logger.info(f"Loading {embedding_path}...") embedding_store = EmbeddingStore(embedding_path) index = None offset: int = 0 keys = [] - for logo_id, embedding in embedding_store.iter_embeddings(): + logger.info("Adding embeddings to index...") + for logo_id, embedding in tqdm.tqdm(embedding_store.iter_embeddings()): if index is None: output_dim = embedding.shape[-1] index = AnnoyIndex(output_dim, "euclidean") @@ -36,13 +45,19 @@ def generate_index(output: pathlib.Path, tree_count: int): keys.append(int(logo_id)) offset += 1 + logger.info("Building index...") if index is not None: index.build(tree_count) index.save(str(output)) + logger.info("Index built.") + logger.info("Saving keys...") + with output.with_suffix(".txt").open("w") as f: for key in keys: f.write(str(key) + "\n") - cli.add_command(generate_index) - cli() + logger.info("Keys saved.") + + cli.add_command(generate_index) + cli() diff --git a/requirements.txt b/requirements.txt index d199dae..73002e4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,6 @@ torch==1.5.0 h5py==2.10.0 Pillow==7.1.2 requests==2.23.0 -jsonschema==3.2.0 \ No newline at end of file +jsonschema==3.2.0 +click==7.1.2 +tqdm==4.47.0 \ No newline at end of file From c2a35477c8255d25a2208a51310e5eeb6f021511 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Wed, 15 Jul 2020 13:47:17 +0200 Subject: [PATCH 21/30] Improve Dockerfile configuration --- Dockerfile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5bde8ef..588961b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,10 +2,15 @@ FROM python:3.7-slim WORKDIR /opt/ann +RUN apt-get update && \ + apt-get install --no-install-suggests --no-install-recommends -y build-essential && \ + apt-get autoremove --purge && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + COPY *.py /opt/ann/ COPY requirements.txt /opt/ann/ - -RUN apt-get update && apt-get install --no-install-recommends -y build-essential && pip3 install -r /opt/ann/requirements.txt +RUN pip3 install --no-cache-dir -r /opt/ann/requirements.txt WORKDIR /opt/ann ENTRYPOINT ["/usr/local/bin/gunicorn", "--config", "/opt/ann/gunicorn.py", "api:api"] From 4be14895259ebd879b369730dc60b840065926fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Fri, 25 Sep 2020 14:42:44 +0200 Subject: [PATCH 22/30] Upgrade black to version 20.8b1 --- Dockerfile | 2 +- embeddings.py | 4 +++- gunicorn.py | 3 --- 3 files changed, 4 insertions(+), 5 deletions(-) delete mode 100644 gunicorn.py diff --git a/Dockerfile b/Dockerfile index 588961b..0ceb8d0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,4 +13,4 @@ COPY requirements.txt /opt/ann/ RUN pip3 install --no-cache-dir -r /opt/ann/requirements.txt WORKDIR /opt/ann -ENTRYPOINT ["/usr/local/bin/gunicorn", "--config", "/opt/ann/gunicorn.py", "api:api"] +ENTRYPOINT ["/usr/local/bin/gunicorn", "--config", "/opt/ann/gunicorn_conf.py", "api:api"] diff --git a/embeddings.py b/embeddings.py index ac56bd8..ae5898e 100644 --- a/embeddings.py +++ b/embeddings.py @@ -72,7 +72,9 @@ def iter_embeddings(self) -> Iterable[Tuple[int, np.ndarray]]: yield logo_id, embedding def save_embeddings( - self, embeddings: np.ndarray, external_ids: np.ndarray, + self, + embeddings: np.ndarray, + external_ids: np.ndarray, ): file_exists = self.hdf5_path.is_file() diff --git a/gunicorn.py b/gunicorn.py deleted file mode 100644 index 
a2f2287..0000000 --- a/gunicorn.py +++ /dev/null @@ -1,3 +0,0 @@ -bind = ":5501" -workers = 1 -timeout = 60 From c41c7fe4f4670cc9f78c317cd481f7c2d60186ca Mon Sep 17 00:00:00 2001 From: Wauplin Date: Fri, 4 Jun 2021 11:03:01 +0200 Subject: [PATCH 23/30] Add poetry + some project changes (#315) * switch to pyproject.toml + add isort * Tmp add branch to github workflow * Change pipeline name * force trigger pipeline ? * fix toml sort step * updated flake8/mypy config * isort config + fix in files * FIX cmd in pipeline * fix tests in gh pipeline * updated project settings * remove setup.py * switch to pyproject.toml + add isort * Tmp add branch to github workflow * Change pipeline name * force trigger pipeline ? * fix toml sort step * updated flake8/mypy config * isort config + fix in files * FIX cmd in pipeline * fix tests in gh pipeline * updated project settings * remove setup.py * Rebase master + fix isort * Script to download nutriscore weights * remove python3.9 from github CI * changed cache in CI * github flake8 action * integrate typer * isort * flake8. * fix cli in workflow * Add pytest coverage * remove codecov upload * Build documentation using mkdocs * FIXed pipeline * Auto-generate CLI reference * fix workflow * update link in doc * Documentation only on master * change sonarcloud pipeline name * codecov config * codecov * Update documentation + sync with readme --- api.py | 11 +++++------ embeddings.py | 8 +++----- manage.py | 5 ++--- utils.py | 2 +- 4 files changed, 11 insertions(+), 15 deletions(-) diff --git a/api.py b/api.py index 8b7d7da..a2c87be 100644 --- a/api.py +++ b/api.py @@ -4,17 +4,16 @@ import annoy import falcon +import numpy as np +import schema +import sentry_sdk +import settings +from embeddings import EMBEDDING_STORE, add_logos, get_embedding from falcon.media.validators import jsonschema from falcon_cors import CORS from falcon_multipart.middleware import MultipartMiddleware -import numpy as np -import sentry_sdk from sentry_sdk.integrations.falcon import FalconIntegration - -from embeddings import add_logos, EMBEDDING_STORE, get_embedding from utils import get_image_from_url, get_logger, text_file_iter -import schema -import settings logger = get_logger() diff --git a/embeddings.py b/embeddings.py index ae5898e..34d9715 100644 --- a/embeddings.py +++ b/embeddings.py @@ -4,12 +4,10 @@ import h5py import numpy as np -from PIL import Image - -from efficientnet_pytorch import EfficientNet -import torch - import settings +import torch +from efficientnet_pytorch import EfficientNet +from PIL import Image class EmbeddingStore: diff --git a/manage.py b/manage.py index 291a7f3..e8e20fd 100644 --- a/manage.py +++ b/manage.py @@ -14,11 +14,10 @@ def generate_index(output: pathlib.Path, tree_count: int): import shutil import tempfile - from annoy import AnnoyIndex + import settings import tqdm - + from annoy import AnnoyIndex from embeddings import EmbeddingStore - import settings from utils import get_logger logger = get_logger() diff --git a/utils.py b/utils.py index d351019..419cd28 100644 --- a/utils.py +++ b/utils.py @@ -7,8 +7,8 @@ import tempfile from typing import Callable, Dict, Iterable, Optional, Tuple, Union -from PIL import Image import requests +from PIL import Image def get_logger(name=None, level: Optional[int] = None): From bc9a2ea876b294e8cfba8c9d05382e09b50d17e2 Mon Sep 17 00:00:00 2001 From: Yana K Date: Mon, 2 Aug 2021 17:27:11 +0200 Subject: [PATCH 24/30] Parameterise Sentry initialisation to allow setting environments. 
--- api.py | 3 +-- settings.py | 28 ++++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/api.py b/api.py index a2c87be..8fced75 100644 --- a/api.py +++ b/api.py @@ -6,7 +6,6 @@ import falcon import numpy as np import schema -import sentry_sdk import settings from embeddings import EMBEDDING_STORE, add_logos, get_embedding from falcon.media.validators import jsonschema @@ -17,7 +16,7 @@ logger = get_logger() -sentry_sdk.init(dsn=settings.SENTRY_DSN, integrations=[FalconIntegration()]) +settings.init_sentry(integrations=[FalconIntegration()]) def load_index(file_path: pathlib.Path, dimension: int) -> annoy.AnnoyIndex: diff --git a/settings.py b/settings.py index cd62853..36b0251 100644 --- a/settings.py +++ b/settings.py @@ -1,11 +1,12 @@ import os import pathlib -from typing import Dict +import sentry_sdk +from sentry_sdk.integrations import Integration +from typing import Dict, Sequence, Optional PROJECT_DIR = pathlib.Path(__file__).parent DATA_DIR = PROJECT_DIR / "data" -SENTRY_DSN = os.environ.get("SENTRY_DSN") INDEX_DIM: Dict[str, int] = {"efficientnet-b0": 1280, "efficientnet-b5": 2048} IMAGE_INPUT_DIM: Dict[str, int] = {"efficientnet-b0": 224} @@ -16,3 +17,26 @@ DEFAULT_MODEL = "efficientnet-b0" DEFAULT_HDF5_COUNT = 10000000 EMBEDDINGS_HDF5_PATH = DATA_DIR / "efficientnet-b0.hdf5" + +# Should be either 'prod' or 'dev'. +_ann_instance = os.environ.get("ANN_INSTANCE", "prod") + +if _ann_instance != "prod" and _ann_instance != "dev": + raise ValueError( + "ANN_INSTANCE should be either 'prod' or 'dev', got %s" % _ann_instance + ) + +_sentry_dsn = os.environ.get("SENTRY_DSN") + + +def init_sentry(integrations: Sequence[Integration] = None): + if _sentry_dsn: + sentry_sdk.init( + _sentry_dsn, + environment=_ann_instance, + integrations=integrations, + ) + else: + raise ValueError( + "init_sentry was requested, yet SENTRY_DSN env variable was not provided" + ) From dce73c40ed288eaa84e768f711ed6c9a62163aa2 Mon Sep 17 00:00:00 2001 From: Yana K Date: Tue, 3 Aug 2021 10:34:27 +0200 Subject: [PATCH 25/30] Lint the files --- settings.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/settings.py b/settings.py index 36b0251..52442b1 100644 --- a/settings.py +++ b/settings.py @@ -1,8 +1,9 @@ import os import pathlib +from typing import Dict, Sequence + import sentry_sdk from sentry_sdk.integrations import Integration -from typing import Dict, Sequence, Optional PROJECT_DIR = pathlib.Path(__file__).parent DATA_DIR = PROJECT_DIR / "data" From d419f3be1e07338bc5ddd63a118ac5bb13a154cc Mon Sep 17 00:00:00 2001 From: Yana K Date: Thu, 12 Aug 2021 09:58:03 +0200 Subject: [PATCH 26/30] Allow to not post anything on Slack if the robotoff is running as dev/locally (#351) * Refactor slack functionality to avoid sending notifications on test runs. 
* Add tests * Add skeleton test for the SlackNotificationsFactory * Remove WIP comment * Fix mypy error --- settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/settings.py b/settings.py index 52442b1..63b15e3 100644 --- a/settings.py +++ b/settings.py @@ -30,7 +30,7 @@ _sentry_dsn = os.environ.get("SENTRY_DSN") -def init_sentry(integrations: Sequence[Integration] = None): +def init_sentry(integrations: Sequence[Integration] = ()): if _sentry_dsn: sentry_sdk.init( _sentry_dsn, From a52fb6fe580aa0196282ccb07e5f9677af1b6a80 Mon Sep 17 00:00:00 2001 From: Yana K Date: Thu, 12 Aug 2021 10:20:11 +0200 Subject: [PATCH 27/30] Optimise image cropping by constructing the Image object directly from the response bytes (#345) --- utils.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/utils.py b/utils.py index 419cd28..cf4d323 100644 --- a/utils.py +++ b/utils.py @@ -4,7 +4,7 @@ import os import pathlib import sys -import tempfile +from io import BytesIO from typing import Callable, Dict, Iterable, Optional, Tuple, Union import requests @@ -125,8 +125,4 @@ def get_image_from_url( if r.status_code != 200: return None - with tempfile.NamedTemporaryFile() as f: - f.write(r.content) - image = Image.open(f.name) - - return image + return Image.open(BytesIO(r.content)) From 09c4ed429766b60dd2ea9a725ee8a455610ba12c Mon Sep 17 00:00:00 2001 From: Yana K Date: Thu, 12 Aug 2021 14:34:09 +0200 Subject: [PATCH 28/30] Add the missing gunicorn conf file (#356) --- gunicorn_conf.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 gunicorn_conf.py diff --git a/gunicorn_conf.py b/gunicorn_conf.py new file mode 100644 index 0000000..265f587 --- /dev/null +++ b/gunicorn_conf.py @@ -0,0 +1,3 @@ +bind = ":5501" +workers = 1 +timeout = 60 \ No newline at end of file From cafeca49e0646b629dccf8dbb81c70c3faf1c8f0 Mon Sep 17 00:00:00 2001 From: Yana K Date: Fri, 13 Aug 2021 15:10:13 +0200 Subject: [PATCH 29/30] Default to 'dev' for the Robotoff instance (#354) --- settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/settings.py b/settings.py index 63b15e3..6ffe8d1 100644 --- a/settings.py +++ b/settings.py @@ -20,7 +20,7 @@ EMBEDDINGS_HDF5_PATH = DATA_DIR / "efficientnet-b0.hdf5" # Should be either 'prod' or 'dev'. -_ann_instance = os.environ.get("ANN_INSTANCE", "prod") +_ann_instance = os.environ.get("ANN_INSTANCE", "dev") if _ann_instance != "prod" and _ann_instance != "dev": raise ValueError( From ec7db2a58897c332361eaf3be4eba70f1bde9a07 Mon Sep 17 00:00:00 2001 From: Yana K Date: Tue, 17 Aug 2021 09:47:05 +0200 Subject: [PATCH 30/30] Hardcodes (#362) * Properly format code and remove more hardcodes * Fix missed hardcode --- gunicorn_conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gunicorn_conf.py b/gunicorn_conf.py index 265f587..a2f2287 100644 --- a/gunicorn_conf.py +++ b/gunicorn_conf.py @@ -1,3 +1,3 @@ bind = ":5501" workers = 1 -timeout = 60 \ No newline at end of file +timeout = 60
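

Note (not part of the patch series): after the last patch, the ANN service exposes the routes registered in api.py: GET /api/v1/ann/{logo_id}, GET /api/v1/ann (random logo), GET /api/v1/ann/batch, POST /api/v1/ann/from_embedding, POST /api/v1/ann/add, GET /api/v1/ann/count and GET /api/v1/ann/stored. The sketch below shows how a client might exercise a few of them. It assumes the service is running locally on the ":5501" port configured in gunicorn_conf.py and that an index directory such as data/efficientnet-b0 (index.bin plus index.txt) has been built; the logo ID and image URL are made up, adjust them for real data.

# Minimal example client for the ANN API built in this patch series
# (a usage sketch under the assumptions above, not part of the patches).
import requests

BASE_URL = "http://localhost:5501/api/v1/ann"  # assumed host; port taken from gunicorn_conf.py

# Nearest neighbors of an already-indexed logo. "index" defaults to
# efficientnet-b0 and "count" is capped at 500 (default 100) in ANNResource.
resp = requests.get(f"{BASE_URL}/42", params={"index": "efficientnet-b0", "count": 10})
if resp.status_code == 200:
    for result in resp.json()["results"]:
        print(result["logo_id"], result["distance"])

# Nearest neighbors of a raw embedding: the body must carry a vector of the
# index dimension (1280 for efficientnet-b0). Note that after patch 05 the
# length check in ANNEmbeddingResource still compares against the whole
# INDEX_DIM dict rather than INDEX_DIM[index_name], so this call may come
# back as a 400 until that check is adjusted.
resp = requests.post(
    f"{BASE_URL}/from_embedding",
    json={"embedding": [0.0] * 1280, "count": 5},
)
print(resp.status_code)

# Store embeddings for new logos: the service downloads the image, crops each
# bounding box (relative y_min, x_min, y_max, x_max coordinates), runs the
# EfficientNet model and appends the embeddings to the HDF5 store.
resp = requests.post(
    f"{BASE_URL}/add",
    json={
        "image_url": "https://example.com/some-image.jpg",  # hypothetical URL
        "logos": [{"id": 42, "bounding_box": [0.1, 0.1, 0.5, 0.5]}],
    },
)
print(resp.json())  # {"added": <number of newly stored embeddings>}

# Number of embeddings currently stored in the HDF5 file.
print(requests.get(f"{BASE_URL}/count").json())

Newly added logos can be queried right away, because get_nearest_neighbors falls back to get_nns_by_vector with the stored embedding when a logo is missing from the index, but they only start appearing as neighbors of other logos once a fresh Annoy index has been rebuilt from the HDF5 store with the generate_index command in manage.py, since an Annoy index cannot be extended after build().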