From 55b85024395a956e928c7fffb5fa2bae2e57ba82 Mon Sep 17 00:00:00 2001
From: cmungall <cjm@berkeleybop.org>
Date: Fri, 11 Aug 2023 12:34:12 -0700
Subject: [PATCH] Adding pubmed

---
 Makefile                                 |   6 +
 adhoc.Makefile                           |   6 +-
 poetry.lock                              |   2 +-
 pyproject.toml                           |   1 +
 src/curate_gpt/agents/chat.py            |   1 +
 src/curate_gpt/agents/pubmed.py          | 179 +++++++++++++++++++++++
 src/curate_gpt/app/app.py                |  72 +++++++--
 src/curate_gpt/app/cases.yaml            |  35 +++++
 src/curate_gpt/app/helper.py             |  35 +++++
 src/curate_gpt/cli.py                    |  79 +++++++++-
 src/curate_gpt/store/chromadb_adapter.py |  12 +-
 src/curate_gpt/store/db_adapter.py       |  12 +-
 12 files changed, 417 insertions(+), 23 deletions(-)
 create mode 100644 src/curate_gpt/agents/pubmed.py
 create mode 100644 src/curate_gpt/app/cases.yaml
 create mode 100644 src/curate_gpt/app/helper.py

diff --git a/Makefile b/Makefile
index 0b3fbef..0d3e6ff 100644
--- a/Makefile
+++ b/Makefile
@@ -8,10 +8,16 @@ ONTS = cl uberon obi go envo hp mp mondo po
 all: index_all_oai
 
 index_all_oai: $(patsubst %,terms-oai-%,$(ONTS))
+index_all2_oai: $(patsubst %,terms_defns-oai-%,$(ONTS))
 
 terms-oai-%:
 	$(CURATE) ontology index -p $(DB_PATH) -c terms_$*_oai -m openai: sqlite:obo:$*
 
+
+terms_defns-oai-%:
+	$(CURATE) ontology index --index-fields label,definition,relationships -p $(DB_PATH) -c terms_defns_$*_oai -m openai: sqlite:obo:$*
+
+
 terms-default-%:
 	$(CURATE) ontology index -p $(DB_PATH) -c terms_$* sqlite:obo:$*
 
diff --git a/adhoc.Makefile b/adhoc.Makefile
index 066fd0f..3b824d2 100644
--- a/adhoc.Makefile
+++ b/adhoc.Makefile
@@ -1,8 +1,12 @@
 RUN = poetry run
+DB_PATH = db
 
 data/nmdc.json:
 	$(RUN) python -m curate_gpt.adhoc.nmdc_sample_downloader --no-stream  --format json > $@
 
 
 index-nmdc: data/nmdc.json
-	$(RUN) curategpt -v index -c nmdc $<
+	$(RUN) curategpt -v index -p $(DB_PATH) -m openai: -c biosamples_nmdc_oai --object-type Biosample --description "Samples taken from NMDC database" $<
+
+index-obi-issues:
+	$(RUN) curategpt index -c github_issues_obi_oai -m openai: ../formal-ontology-analysis/repo-dirs/metadata/*.json
diff --git a/poetry.lock b/poetry.lock
index 197657a..331fd06 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -7520,4 +7520,4 @@ docs = []
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9, !=3.9.7"
-content-hash = "19f03da820d4a50752a975b58c9c9fa3ab9a59a8e88cd2b14e72d206c7bad544"
+content-hash = "1e27f26ce2f8bd5e9df25ddbab685d0948b9e9d63d6c66899a225a3e68b9e726"
diff --git a/pyproject.toml b/pyproject.toml
index 127124d..e45fa5b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,6 +27,7 @@ replicate = "^0.11.0"
 sqlite-utils = "^3.34"
 gpt4all = "^1.0.8"
 httpx = "^0.24.1"
+eutils = "^0.6.0"
 
 [tool.poetry.dev-dependencies]
 pytest = ">=7.1.2"
diff --git a/src/curate_gpt/agents/chat.py b/src/curate_gpt/agents/chat.py
index 98f6817..67d1673 100644
--- a/src/curate_gpt/agents/chat.py
+++ b/src/curate_gpt/agents/chat.py
@@ -67,6 +67,7 @@ def chat(
         texts = []
         i = 0
         references = {}
+        logger.info(f"Chat: {query} on {self.kb_adapter} kwargs: {kwargs}")
         for obj, _, obj_meta in self.kb_adapter.search(
                 query, relevance_factor=self.relevance_factor, **kwargs
         ):
diff --git a/src/curate_gpt/agents/pubmed.py b/src/curate_gpt/agents/pubmed.py
new file mode 100644
index 0000000..96bb792
--- /dev/null
+++ b/src/curate_gpt/agents/pubmed.py
@@ -0,0 +1,179 @@
+"""Search PubMed and chat over the retrieved records."""
+import json
+import logging
+import re
+import time
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Any, Dict, List, ClassVar, Optional, Iterator
+
+import requests
+import yaml
+from eutils import Client
+from pydantic import BaseModel
+
+from curate_gpt.agents.chat import ChatEngine, ChatResponse
+from curate_gpt.extract import AnnotatedObject, Extractor
+from curate_gpt.store import DBAdapter
+from curate_gpt.store.db_adapter import SEARCH_RESULT
+
+logger = logging.getLogger(__name__)
+
+ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+
+PUBMED_COLLECTION_NAME = "pubmed_subset"
+PUBMED_TEMP_COLLECTION_NAME = "__pubmed_temp__"
+PUBMED_EMBEDDING_MODEL = "openai:"
+
+
+@dataclass
+class PubmedAgent:
+    """
+    An agent that searches PubMed and caches the hits in a local store.
+
+    TODO: make this a virtual store
+    """
+
+    local_store: DBAdapter = None
+    """Adapter to local knowledge base"""
+
+    eutils_client: Client = None
+
+    extractor: Extractor = None
+
+    def search(
+        self,
+        text: str,
+        collection: str = None,
+        cache: bool = True,
+        expand: bool = True,
+        **kwargs,
+    ) -> Iterator[SEARCH_RESULT]:
+        """
+        Search PubMed, store the hits locally, then search that local collection.
+
+        :param text: free-text query
+        :param collection: collection to cache hits in (default: PUBMED_COLLECTION_NAME)
+        :param cache: if False, use a temporary collection that is removed first
+        :param expand: if True, ask the LLM for related terms and OR them together
+        :param kwargs: passed through to the local store's search
+        :return: generator over the local store's results; runs only when iterated
+        """
+        if collection is None:
+            collection = PUBMED_COLLECTION_NAME
+        logger.info(f"Searching for {text}, caching in {collection}")
+        if expand:
+            logger.info(f"Expanding search term: {text} to create pubmed query")
+            model = self.extractor.model
+            response = model.prompt(text, system="generate a semi-colon separated list of the most relevant terms")
+            terms = [t.strip() for t in response.text().split(";") if t.strip()]  # no dangling OR
+            search_term = " OR ".join(terms)
+        else:
+            search_term = text
+        logger.info(f"Constructed search term: {search_term}")
+        # Parameters for the request
+        params = {
+            "db": "pubmed",
+            "term": search_term,
+            "retmax": 100,
+            "sort": "relevance",
+            "retmode": "json"
+        }
+
+        time.sleep(0.5)
+        response = requests.get(ESEARCH_URL, params=params)
+        response.raise_for_status()
+        data = response.json()
+        # Extract PubMed IDs from the response
+        pubmed_ids = data["esearchresult"]["idlist"]
+
+        if not pubmed_ids:
+            logger.warning(f"No results with {search_term}")
+            if expand:
+                logger.info("Trying again without expansion")
+                # A generator's return value is discarded, so re-yield the fallback results.
+                yield from self.search(text, collection=collection, cache=cache, expand=False, **kwargs)
+            else:
+                logger.error(f"Failed to find results for {text}")
+            return
+
+        logger.info(f"Found {len(pubmed_ids)} results: {pubmed_ids}")
+        # Parameters for the efetch request
+        efetch_params = {
+            "db": "pubmed",
+            "id": ",".join(pubmed_ids),  # Combine PubMed IDs into a comma-separated string
+            "rettype": "medline",
+            "retmode": "text"
+        }
+        efetch_response = requests.get(EFETCH_URL, params=efetch_params)
+        efetch_response.raise_for_status()
+        medline_records = efetch_response.text
+
+        # Parsing titles and abstracts from the MEDLINE records
+        parsed_data = []
+        current_record = {}
+        current_field = None
+
+        for line in medline_records.split("\n"):
+            if line.startswith("PMID- "):
+                current_field = "id"
+                current_record[current_field] = "PMID:" + line.replace("PMID- ", "").strip()
+            elif line.startswith("PMC - "):
+                current_field = "pmcid"
+                current_record[current_field] = "PMCID:" + line.replace("PMC - ", "").strip()
+            elif line.startswith("TI  - "):
+                current_field = "title"
+                current_record[current_field] = line.replace("TI  - ", "").strip()
+            elif line.startswith("AB  - "):
+                current_field = "abstract"
+                current_record[current_field] = line.replace("AB  - ", "").strip()
+            elif line.startswith("    "):  # Continuation of the previous field
+                if current_field and current_field in current_record:
+                    current_record[current_field] += " " + line.strip()
+            else:
+                current_field = None
+
+            if line == "":
+                if current_record:
+                    parsed_data.append(current_record)
+                    current_record = {}
+        if current_record:  # keep the last record when the text has no trailing blank line
+            parsed_data.append(current_record)
+        db = self.local_store
+        if not cache:
+            collection = PUBMED_TEMP_COLLECTION_NAME
+            db.remove_collection(collection, exists_ok=True)
+        logger.info(f"Inserting {len(parsed_data)} records into {collection}")
+        db.upsert(parsed_data, collection=collection, model=PUBMED_EMBEDDING_MODEL)
+        db.update_collection_metadata(collection, description="Special cache for pubmed searches")
+        yield from db.search(text, collection=collection, **kwargs)
+
+    def chat(
+            self,
+            query: str,
+            collection: str = None,
+            **kwargs,
+    ) -> ChatResponse:
+        """
+        Chat with pubmed.
+
+        :param query: question to answer
+        :param collection: collection used as the pubmed cache (default: PUBMED_COLLECTION_NAME)
+        :param kwargs: passed through to :meth:`search`
+        :return: response from a ChatEngine run over the cached collection
+        """
+        if collection is None:
+            collection = PUBMED_COLLECTION_NAME
+        logger.info(f"Searching pubmed for {query}, kwargs={kwargs}, self={self}")
+        # prime the pubmed cache; search() is a generator, so it must be drained to run at all
+        for _ in self.search(query, collection=collection, **kwargs):
+            pass
+        chat = ChatEngine(kb_adapter=self.local_store, extractor=self.extractor)
+        return chat.chat(query, collection=collection)
+
+
+
+
+
+
diff --git a/src/curate_gpt/app/app.py b/src/curate_gpt/app/app.py
index 4802b61..ba5b472 100644
--- a/src/curate_gpt/app/app.py
+++ b/src/curate_gpt/app/app.py
@@ -5,18 +5,23 @@
 import yaml
 
 from curate_gpt import ChromaDBAdapter
-from curate_gpt.agents.chat import ChatEngine
+from curate_gpt.agents.chat import ChatEngine, ChatResponse
 from curate_gpt.agents.dalek import DatabaseAugmentedExtractor
+from curate_gpt.agents.pubmed import PubmedAgent
+from curate_gpt.app.helper import get_case_collection, get_applicable_examples
 from curate_gpt.extract import BasicExtractor
 
+PUBMED = "PubMed (via API)"
+
 SEARCH = "Search"
 ABOUT = "About"
 INSERT = "Insert"
-CREATE = "Synthesize"
+CREATE = "Generate"
 CHAT = "Chat"
 EXTRACT = "Extract"
 CART = "Cart"
 HELP = "Help"
+EXAMPLES = "Examples"
 
 NO_BACKGROUND_SELECTED = "No background collection"
 
@@ -26,20 +31,21 @@
 
 db = ChromaDBAdapter()
 extractor = BasicExtractor()
-agent = DatabaseAugmentedExtractor(kb_adapter=db, extractor=extractor)
-chatbot = ChatEngine(kb_adapter=db, extractor=extractor)
+
+
+
 
 st.title("CurateGPT! _alpha_")
 if not db.list_collection_names():
     st.warning("No collections found. Please use command line to load one.")
 
 # Sidebar with operation selection
-option = st.sidebar.selectbox("Choose operation", (CHAT, SEARCH, CREATE, INSERT, CART, ABOUT, HELP, ))
+option = st.sidebar.selectbox("Choose operation", (CHAT, SEARCH, CREATE, INSERT, CART, ABOUT, HELP, EXAMPLES))
 
 
 collection = st.sidebar.selectbox(
     "Choose collection",
-    list(db.list_collection_names()),
+    list(db.list_collection_names()) + [PUBMED],
     help="""
     A collection is a knowledge base. It could be anything, but
     it's likely your instance has some bio-ontologies pre-loaded.
@@ -62,7 +68,7 @@
 
 background_collection = st.sidebar.selectbox(
     "Background knowledge",
-    [NO_BACKGROUND_SELECTED] + list(db.list_collection_names()),
+    [NO_BACKGROUND_SELECTED, PUBMED] + list(db.list_collection_names()),
     help="""
     Background databases can be used to give additional context to the LLM.
     A standard pattern is to have a structured knowledge base as the main
@@ -74,6 +80,17 @@
     """
 )
 
+st.sidebar.markdown("Developed by the Monarch Initiative")
+
+
+def ask_chatbot(query) -> ChatResponse:
+    """Answer ``query`` against the sidebar-selected collection."""
+    if collection != PUBMED:
+        engine = ChatEngine(kb_adapter=db, extractor=extractor)
+        return engine.chat(query, collection=collection)
+    # PubMed is not a stored collection; the agent queries it on demand.
+    return PubmedAgent(local_store=db, extractor=extractor).chat(query)
+
 
 def html_table(rows: List[dict]) -> str:
     hdr = rows[0].keys()
@@ -92,8 +109,6 @@ def html_table(rows: List[dict]) -> str:
     return html_content
 
 
-
-# Insert operation
 if option == INSERT:
     st.subheader(f"Insert new document in {collection}")
     objs = list(db.peek(collection=collection))
@@ -166,6 +181,9 @@ def html_table(rows: List[dict]) -> str:
                                for the model you selected.
                                """)
 
+    examples = get_applicable_examples(collection, CREATE)
+    st.write("Examples:")
+    st.write(f"<details>{html_table(examples)}</details>", unsafe_allow_html=True)
     extractor.model_name = model_name
 
     # Check for session state variables
@@ -175,13 +193,20 @@ def html_table(rows: List[dict]) -> str:
     if st.button(CREATE):
         if not property_query:
             property_query = "label"
+        dalek = DatabaseAugmentedExtractor(kb_adapter=db, extractor=extractor)
         if background_collection != NO_BACKGROUND_SELECTED:
-            agent.document_adapter = db
-            agent.document_adapter_collection = background_collection
+            if background_collection == PUBMED:
+                dalek.document_adapter = PubmedAgent(local_store=db, extractor=extractor)
+                dalek.collection = None
+            else:
+                dalek.document_adapter = db
+                dalek.document_adapter_collection = background_collection
         st.write(f"Generating using: **{extractor.model_name}** using *{collection}* for examples")
+        if background_collection != NO_BACKGROUND_SELECTED:
+            st.write(f"Using background knowledge from: *{background_collection}*")
         rules = [instructions] if instructions else None
         st.session_state.results = [
-            agent.generate_extract(
+            dalek.generate_extract(
                 search_query,
                 #target_class="OntologyClass",
                 context_property=property_query,
@@ -225,7 +250,7 @@ def html_table(rows: List[dict]) -> str:
         if not property_query:
             property_query = "label"
         st.session_state.results = [
-            agent.generate_extract(
+            dalek.generate_extract(
                 search_query,
                 target_class="OntologyClass",
                 context_property=property_query,
@@ -263,9 +288,12 @@ def html_table(rows: List[dict]) -> str:
                                    complete results, but may also exceed context windows for the model.
                                    """)
     extractor.model_name = model_name
+    examples = get_applicable_examples(collection, CHAT)
+    st.write("Examples:")
+    st.write(f"<details>{html_table(examples)}</details>", unsafe_allow_html=True)
 
     if st.button(CHAT):
-        response = chatbot.chat(query, collection=collection)
+        response = ask_chatbot(query)
         st.markdown(response.formatted_response)
         for ref, text in response.references.items():
             st.subheader(f"Reference {ref}", anchor=f"ref-{ref}")
@@ -274,6 +302,11 @@ def html_table(rows: List[dict]) -> str:
 elif option == CART:
     st.subheader("Coming soon!")
 
+elif option == EXAMPLES:
+    cc = get_case_collection()
+    st.subheader("Examples")
+    st.code(yaml.dump(cc, sort_keys=False), language="yaml")
+
 
 elif option == ABOUT:
     st.subheader("About this instance")
@@ -289,17 +322,26 @@ def html_table(rows: List[dict]) -> str:
     st.subheader("About")
     st.write("CurateGPT is a tool for generating new entries for a knowledge base, assisted by LLMs.")
     st.write("It is a highly generic system, but it's likely the instance you are using now is configured to work with ontologies.")
+    st.subheader("Issues")
+    st.write("If you have any issues, please raise them on the [GitHub issue tracker](https://github.com/monarch-initiative/curate-gpt).")
     st.subheader("Warning!")
     st.caption("CurateGPT is pre-alpha, documentation is incomplete!")
     st.caption("If you are using a publicly deployed instance, some operations may be slow, or broken")
     st.subheader("Instructions")
     st.write("Use the sidebar to select the operation you want to perform.")
     st.write(" * Synthesize: the core operation. Generate a new entry for the selected collection.")
+    st.write(" * Chat: chat to a structured knowledge base or unstructured source.")
     st.write(" * Search: Search the vector stores.")
     st.write(" * Insert: Manually add data (do this responsibly if on public instance - no auth yet!.")
     st.write(" * About: View metadata for each instance.")
     st.subheader("FAQ")
-    st.write("### Why are there no IDs")
+    st.write("### Why are there no IDs?")
     st.write("LLMs will hallucinate IDs so we transform to CamelCase labels for demo purposes.")
     st.write("In future versions we will have a more elegant solution.")
+    st.write("### What is the PubMed collection?")
+    st.write("This is a special *virtual* collection. It is not populated ahead of time.")
+    st.write("When this is used as a source, the pubmed API is called with a relevancy search.")
+    st.write("These results are then combined with others to answer the query.")
+    st.write("### What is the 'background' collection?")
+    st.write(f"This is used only by '{CREATE}' to provide additional context.")
 
diff --git a/src/curate_gpt/app/cases.yaml b/src/curate_gpt/app/cases.yaml
new file mode 100644
index 0000000..bc159ca
--- /dev/null
+++ b/src/curate_gpt/app/cases.yaml
@@ -0,0 +1,35 @@
+cases:
+- source: cl
+  mode: CHAT
+  input: "What neurotransmitter is released by the hippocampus?"
+  domains:
+    - neuroscience
+    - cell types
+  answers:
+    - matches: glutamate
+      reference: GlutamateSecretion
+- source: cl
+  mode: CHAT
+  input: "What cells synthesize catecholamine?"
+- source: pubmed
+  mode: CHAT
+  input: "what neurons express VIP?"
+  domains:
+    - neuroscience
+    - cell types
+- source: pubmed
+  mode: CHAT
+  input: "what is the role of the hippocampus in memory?"
+- source: envo
+  mode: GENERATE
+  input: "Undersea volcano"
+  property: label
+  background: pubmed
+  domains:
+      - geology
+      - environment
+- source: obi
+  mode: GENERATE
+  input: "magnetoencephalography"
+  property: label
+  background: github_issues_obi
\ No newline at end of file
diff --git a/src/curate_gpt/app/helper.py b/src/curate_gpt/app/helper.py
new file mode 100644
index 0000000..fda2811
--- /dev/null
+++ b/src/curate_gpt/app/helper.py
@@ -0,0 +1,35 @@
+from pathlib import Path
+from typing import List, Dict, Optional
+
+import yaml
+
+HELP_CASES = Path(__file__).parent / "cases.yaml"
+
+def get_case_collection():
+    """Load the example cases bundled in ``cases.yaml`` next to this module."""
+    return yaml.safe_load(HELP_CASES.read_text(encoding="utf-8"))
+
+def get_applicable_examples(collection: Optional[str], mode: str, relax=True) -> List[Dict]:
+    """
+    Get applicable examples for a given collection and mode.
+
+    :param collection: collection name; a case applies if its source occurs in it (case-insensitive)
+    :param mode: mode to match (compared upper-cased); a falsy value matches any mode
+    :return: matching cases with their "domains" and "answers" keys removed
+    """
+    if mode:
+        mode = mode.upper()
+    examples = []
+    cases = get_case_collection()["cases"]
+    for case in cases:
+        if mode and case["mode"] != mode:
+            continue
+        if collection and case["source"].lower() not in collection.lower():
+            # TODO: less hacky check
+            continue
+        case = {k: v for k, v in case.items() if k not in ["domains", "answers"]}
+        examples.append(case)
+    if not examples and relax and collection:
+        # relax=True: nothing matched, so retry once ignoring the collection
+        return get_applicable_examples(None, mode, relax=False)
+    return examples
diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py
index 37c37cf..46e0c62 100644
--- a/src/curate_gpt/cli.py
+++ b/src/curate_gpt/cli.py
@@ -18,6 +18,7 @@
 from curate_gpt.agents.chat import ChatEngine
 
 from curate_gpt.agents.dalek import DatabaseAugmentedExtractor
+from curate_gpt.agents.pubmed import PubmedAgent
 from curate_gpt.extract.basic_extractor import BasicExtractor
 from curate_gpt.rag.openai_rag import OpenAIRAG
 from curate_gpt.store.schema_proxy import SchemaProxy
@@ -46,6 +47,17 @@
 append_option = click.option(
     "--append/--no-append", default=False, show_default=True, help="Append to the database."
 )
+object_type_option = click.option(
+    "--object-type",
+    default="Thing",
+    show_default=True,
+    help="Type of object in index.",
+)
+description_option = click.option(
+    "--description",
+    help="Description of the collection.",
+)
+
 
 
 @click.group(cls=DefaultGroup,
@@ -82,8 +94,12 @@ def main(verbose: int, quiet: bool):
 @collection_option
 @model_option
 @click.option("--text-field")
+@object_type_option
+@description_option
+@click.option("--batch-size", default=None, show_default=True, type=click.INT,
+              help="Batch size for indexing.")
 @click.argument("files", nargs=-1)
-def index(files, path, reset: bool, text_field, collection, model, **kwargs):
+def index(files, path, reset: bool, text_field, collection, model, object_type, description, batch_size, **kwargs):
     """Index files.
 
     Example:
@@ -93,8 +109,6 @@ def index(files, path, reset: bool, text_field, collection, model, **kwargs):
     """
     db = ChromaDBAdapter(path, **kwargs)
     db.text_lookup = text_field
-    if model:
-        db.model = model
     if reset:
         db.reset()
     for file in files:
@@ -106,7 +120,8 @@ def index(files, path, reset: bool, text_field, collection, model, **kwargs):
             objs = yaml.safe_load(open(file))
         if not isinstance(objs, list):
             objs = [objs]
-        db.insert(objs, collection=collection)
+        db.insert(objs, model=model, collection=collection, batch_size=batch_size)
+    db.update_collection_metadata(collection, model=model, object_type=object_type, description=description)
 
 
 @main.command(name="search")
@@ -359,6 +374,62 @@ def index_ontology_command(ont, path, reset: bool, collection, append, model, in
     db.update_collection_metadata(collection, object_type="OntologyClass")
 
 
+@main.group()
+def pubmed():
+    """Use pubmed"""
+
+
+@pubmed.command(name="search")
+@collection_option
+@path_option
+@model_option
+@click.option("--expand/--no-expand", default=True, show_default=True,
+              help="Whether to expand the search term using an LLM.")
+@click.argument("query")
+def pubmed_search(query, path, model,  **kwargs):
+    # Run a PubMed search and print each hit as YAML, numbered in result order.
+    # Options not named in the signature (collection, expand) are forwarded
+    # unchanged to PubmedAgent.search via kwargs.
+    db = ChromaDBAdapter(path)
+    extractor = BasicExtractor()
+    if model:
+        extractor.model_name = model
+    # Named `agent` so the local does not shadow the `pubmed` command group.
+    agent = PubmedAgent(local_store=db, extractor=extractor)
+    hits = agent.search(query, **kwargs)
+    for rank, (obj, distance, _) in enumerate(hits, start=1):
+        print(f"## {rank} DISTANCE: {distance}")
+        print(yaml.dump(obj, sort_keys=False))
+
+
+@pubmed.command(name="ask")
+@collection_option
+@path_option
+@model_option
+@limit_option
+@click.option("--show-references/--no-show-references", default=True,
+              show_default=True,
+              help="Whether to show references.")
+@click.option("--expand/--no-expand", default=True, show_default=True,
+              help="Whether to expand the search term using an LLM.")
+@click.argument("query")
+def pubmed_ask(query, path, model, show_references, **kwargs):
+    # Answer a question via PubmedAgent.chat, optionally listing the references.
+    db = ChromaDBAdapter(path)
+    extractor = BasicExtractor()
+    if model:
+        extractor.model_name = model
+    agent = PubmedAgent(local_store=db, extractor=extractor)
+    answer = agent.chat(query, **kwargs)
+    click.echo(answer.formatted_response)
+    if not show_references:
+        return
+    print("# References:")
+    for ref, ref_text in answer.references.items():
+        print(f"## {ref}")
+        print(ref_text)
+
+
 
 if __name__ == "__main__":
     main()
diff --git a/src/curate_gpt/store/chromadb_adapter.py b/src/curate_gpt/store/chromadb_adapter.py
index dc1b899..f9361a0 100644
--- a/src/curate_gpt/store/chromadb_adapter.py
+++ b/src/curate_gpt/store/chromadb_adapter.py
@@ -165,7 +165,7 @@ def _insert_or_update(
                 objs = list(objs)
             else:
                 objs = [objs]
-        if self._is_openai(collection_obj):
+        if self._is_openai(collection_obj) and batch_size is None:
             batch_size = 100
         if text_field is None:
             text_field = self.text_lookup
@@ -205,6 +205,16 @@ def update(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
         """
         self._insert_or_update(objs, method_name="update", **kwargs)
 
+    def upsert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+        """
+        Upsert an object or list of objects in the store.
+
+        :param objs: a single object or a list of objects
+        :param kwargs: forwarded to ``_insert_or_update`` with ``method_name="upsert"``
+        :return: None
+        """
+        self._insert_or_update(objs, method_name="upsert", **kwargs)
+
     def remove_collection(self, collection: str = DEFAULT_COLLECTION, exists_ok=False, **kwargs):
         """
         Remove a collection from the database.
diff --git a/src/curate_gpt/store/db_adapter.py b/src/curate_gpt/store/db_adapter.py
index aada6d3..d95a963 100644
--- a/src/curate_gpt/store/db_adapter.py
+++ b/src/curate_gpt/store/db_adapter.py
@@ -116,7 +116,7 @@ def create_view(self, view_name: str, collection: str, expression: QUERY, **kwar
         """
         raise NotImplementedError
 
-    def remove_collection(self, collection: str = DEFAULT_COLLECTION, **kwargs):
+    def remove_collection(self, collection: str = DEFAULT_COLLECTION, exists_ok=False, **kwargs):
         """
         Remove a collection from the database.
 
@@ -152,6 +152,16 @@ def set_collection_metadata(self, collection_name: Optional[str], metadata: Coll
         """
         raise NotImplementedError
 
+    def update_collection_metadata(self, collection_name: str, **kwargs) -> CollectionMetadata:
+        """
+        Update the metadata for a collection.
+
+        :param collection_name: name of the collection whose metadata is updated
+        :param kwargs: metadata values to apply, given as keyword arguments
+        :return: annotated as CollectionMetadata; this base version always raises NotImplementedError
+        """
+        raise NotImplementedError
+
     @abstractmethod
     def search(
         self, text: str, where: QUERY = None, collection: str = DEFAULT_COLLECTION, **kwargs