From 55b85024395a956e928c7fffb5fa2bae2e57ba82 Mon Sep 17 00:00:00 2001 From: cmungall Date: Fri, 11 Aug 2023 12:34:12 -0700 Subject: [PATCH] Adding pubmed --- Makefile | 6 + adhoc.Makefile | 6 +- poetry.lock | 2 +- pyproject.toml | 1 + src/curate_gpt/agents/chat.py | 1 + src/curate_gpt/agents/pubmed.py | 179 +++++++++++++++++++++++ src/curate_gpt/app/app.py | 72 +++++++-- src/curate_gpt/app/cases.yaml | 35 +++++ src/curate_gpt/app/helper.py | 35 +++++ src/curate_gpt/cli.py | 79 +++++++++- src/curate_gpt/store/chromadb_adapter.py | 12 +- src/curate_gpt/store/db_adapter.py | 12 +- 12 files changed, 417 insertions(+), 23 deletions(-) create mode 100644 src/curate_gpt/agents/pubmed.py create mode 100644 src/curate_gpt/app/cases.yaml create mode 100644 src/curate_gpt/app/helper.py diff --git a/Makefile b/Makefile index 0b3fbef..0d3e6ff 100644 --- a/Makefile +++ b/Makefile @@ -8,10 +8,16 @@ ONTS = cl uberon obi go envo hp mp mondo po all: index_all_oai index_all_oai: $(patsubst %,terms-oai-%,$(ONTS)) +index_all2_oai: $(patsubst %,terms_defns-oai-%,$(ONTS)) terms-oai-%: $(CURATE) ontology index -p $(DB_PATH) -c terms_$*_oai -m openai: sqlite:obo:$* + +terms_defns-oai-%: + $(CURATE) ontology index --index-fields label,definition,relationships -p $(DB_PATH) -c terms_defns_$*_oai -m openai: sqlite:obo:$* + + terms-default-%: $(CURATE) ontology index -p $(DB_PATH) -c terms_$* sqlite:obo:$* diff --git a/adhoc.Makefile b/adhoc.Makefile index 066fd0f..3b824d2 100644 --- a/adhoc.Makefile +++ b/adhoc.Makefile @@ -1,8 +1,12 @@ RUN = poetry run +DB_PATH = db data/nmdc.json: $(RUN) python -m curate_gpt.adhoc.nmdc_sample_downloader --no-stream --format json > $@ index-nmdc: data/nmdc.json - $(RUN) curategpt -v index -c nmdc $< + $(RUN) curategpt -v index -p $(DB_PATH) -m openai: -c biosamples_nmdc_oai --object-type Biosample --description "Samples taken from NMDC database" $< + +index-obi-issues: + $(RUN) curategpt index -c github_issues_obi_oai -m openai: 
../formal-ontology-analysis/repo-dirs/metadata/*.json diff --git a/poetry.lock b/poetry.lock index 197657a..331fd06 100644 --- a/poetry.lock +++ b/poetry.lock @@ -7520,4 +7520,4 @@ docs = [] [metadata] lock-version = "2.0" python-versions = "^3.9, !=3.9.7" -content-hash = "19f03da820d4a50752a975b58c9c9fa3ab9a59a8e88cd2b14e72d206c7bad544" +content-hash = "1e27f26ce2f8bd5e9df25ddbab685d0948b9e9d63d6c66899a225a3e68b9e726" diff --git a/pyproject.toml b/pyproject.toml index 127124d..e45fa5b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ replicate = "^0.11.0" sqlite-utils = "^3.34" gpt4all = "^1.0.8" httpx = "^0.24.1" +eutils = "^0.6.0" [tool.poetry.dev-dependencies] pytest = ">=7.1.2" diff --git a/src/curate_gpt/agents/chat.py b/src/curate_gpt/agents/chat.py index 98f6817..67d1673 100644 --- a/src/curate_gpt/agents/chat.py +++ b/src/curate_gpt/agents/chat.py @@ -67,6 +67,7 @@ def chat( texts = [] i = 0 references = {} + logger.info(f"Chat: {query} on {self.kb_adapter} kwargs: {kwargs}") for obj, _, obj_meta in self.kb_adapter.search( query, relevance_factor=self.relevance_factor, **kwargs ): diff --git a/src/curate_gpt/agents/pubmed.py b/src/curate_gpt/agents/pubmed.py new file mode 100644 index 0000000..96bb792 --- /dev/null +++ b/src/curate_gpt/agents/pubmed.py @@ -0,0 +1,179 @@ +"""Chat with a KB.""" +import json +import logging +import re +import time +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Dict, List, ClassVar, Optional, Iterator + +import requests +import yaml +from eutils import Client +from pydantic import BaseModel + +from curate_gpt.agents.chat import ChatEngine, ChatResponse +from curate_gpt.extract import AnnotatedObject, Extractor +from curate_gpt.store import DBAdapter +from curate_gpt.store.db_adapter import SEARCH_RESULT + +logger = logging.getLogger(__name__) + +ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" +EFETCH_URL = 
EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

PUBMED_COLLECTION_NAME = "pubmed_subset"
PUBMED_TEMP_COLLECTION_NAME = "__pubmed_temp__"
PUBMED_EMBEDDING_MODEL = "openai:"

# MEDLINE tag -> (record key, prefix prepended to the value).
# Only these tags are kept; every other tag (and its continuation lines) is ignored.
_MEDLINE_FIELDS = {
    "PMID": ("id", "PMID:"),
    "PMC": ("pmcid", "PMCID:"),
    "TI": ("title", ""),
    "AB": ("abstract", ""),
}


@dataclass
class PubmedAgent:
    """
    An agent to pull from pubmed.

    Searches PubMed through the NCBI E-utilities HTTP API, stores the returned
    titles/abstracts in a collection of the local store, and then answers
    searches or chat questions from that collection.

    TODO: make this a virtual store
    """

    # Annotations for project types are quoted so they are not evaluated when
    # the class is created; the dataclass fields themselves are unchanged.
    local_store: "DBAdapter" = None
    """Adapter to local knowledge base"""

    # Not read by any method in this class.
    eutils_client: "Client" = None

    # Supplies the LLM used for query expansion and is handed to ChatEngine.
    extractor: "Extractor" = None

    def search(
        self,
        text: str,
        collection: str = None,
        cache: bool = True,
        expand: bool = True,
        **kwargs,
    ) -> "Iterator[SEARCH_RESULT]":
        """
        Search PubMed, store the hits locally, and yield local search results.

        This is a generator: nothing (including the HTTP requests and the
        writes to the local store) happens until it is iterated.

        :param text: free-text query
        :param collection: collection to store hits in (defaults to
            PUBMED_COLLECTION_NAME); ignored when ``cache`` is false
        :param cache: if false, hits go into PUBMED_TEMP_COLLECTION_NAME,
            which is removed first
        :param expand: if true, ask the LLM for related terms and OR them
            together; if that finds nothing, retry once with the raw text
        :param kwargs: passed through to the local store's ``search``
        :return: iterator over results of searching the local store for ``text``
        """
        if collection is None:
            collection = PUBMED_COLLECTION_NAME
        logger.info(f"Searching for {text}, caching in {collection}")
        if expand:
            logger.info(f"Expanding search term: {text} to create pubmed query")
            search_term = self._expand_query(text)
        else:
            search_term = text
        logger.info(f"Constructed search term: {search_term}")
        # Parameters for the request
        params = {
            "db": "pubmed",
            "term": search_term,
            "retmax": 100,
            "sort": "relevance",
            "retmode": "json",
        }

        time.sleep(0.5)
        response = requests.get(ESEARCH_URL, params=params)
        # Fail on an HTTP error instead of trying to decode an error page as JSON.
        response.raise_for_status()
        data = response.json()

        # Extract PubMed IDs from the response
        pubmed_ids = data["esearchresult"]["idlist"]

        if not pubmed_ids:
            logger.warning(f"No results with {search_term}")
            if expand:
                logger.info("Trying again without expansion")
                # Must be `yield from`: a `return <generator>` inside a
                # generator would discard the retry's results.
                yield from self.search(
                    text, collection=collection, cache=cache, expand=False, **kwargs
                )
            else:
                logger.error(f"Failed to find results for {text}")
            return

        logger.info(f"Found {len(pubmed_ids)} results: {pubmed_ids}")

        # Parameters for the efetch request
        efetch_params = {
            "db": "pubmed",
            "id": ",".join(pubmed_ids),  # Combine PubMed IDs into a comma-separated string
            "rettype": "medline",
            "retmode": "text",
        }
        efetch_response = requests.get(EFETCH_URL, params=efetch_params)
        efetch_response.raise_for_status()

        # Parsing titles and abstracts from the MEDLINE records
        parsed_data = self._parse_medline(efetch_response.text)

        db = self.local_store
        if not cache:
            collection = PUBMED_TEMP_COLLECTION_NAME
            db.remove_collection(collection, exists_ok=True)
        logger.info(f"Inserting {len(parsed_data)} records into {collection}")
        db.upsert(parsed_data, collection=collection, model=PUBMED_EMBEDDING_MODEL)
        db.update_collection_metadata(collection, description="Special cache for pubmed searches")
        yield from db.search(text, collection=collection, **kwargs)

    def chat(
        self,
        query: str,
        collection: str = None,
        **kwargs,
    ) -> "ChatResponse":
        """
        Chat with pubmed.

        Runs :meth:`search` to load PubMed hits into the local store, then
        answers ``query`` with a ChatEngine over that collection.

        :param query: question to ask
        :param collection: collection to store hits in and chat over
            (defaults to PUBMED_COLLECTION_NAME)
        :param kwargs: passed through to :meth:`search`
        :return: response from ChatEngine.chat
        """
        # prime the pubmed cache
        if collection is None:
            collection = PUBMED_COLLECTION_NAME
        logger.info(f"Searching pubmed for {query}, kwargs={kwargs}, self={self}")
        # search() is a generator; it has to be consumed for the fetch and
        # upsert to actually run.
        for _ in self.search(query, collection=collection, **kwargs):
            pass
        if not kwargs.get("cache", True):
            # search() wrote to the temporary collection in this mode.
            collection = PUBMED_TEMP_COLLECTION_NAME
        chat = ChatEngine(kb_adapter=self.local_store, extractor=self.extractor)
        response = chat.chat(query, collection=collection)
        return response

    def _expand_query(self, text: str) -> str:
        """Ask the extractor's model for related terms and join them with OR."""
        model = self.extractor.model
        response = model.prompt(
            text, system="generate a semi-colon separated list of the most relevant terms"
        )
        pieces = (piece.strip() for piece in response.text().split(";"))
        terms = [piece for piece in pieces if piece]
        # Fall back to the raw text if the model returned nothing usable.
        return " OR ".join(terms) if terms else text

    @staticmethod
    def _parse_medline(medline_text: str) -> List[Dict[str, str]]:
        """
        Parse MEDLINE-format text into one dict per record.

        Assumes the NLM MEDLINE layout: a tag left-justified in four columns,
        then ``- `` and the value; continuation lines start with six spaces;
        records are separated by blank lines. Keeps only the tags listed in
        ``_MEDLINE_FIELDS``.

        :param medline_text: text returned by efetch with rettype=medline
        :return: list of records with keys among id, pmcid, title, abstract
        """
        records = []
        record = {}
        field = None  # key of the kept field that continuation lines extend
        for raw_line in medline_text.split("\n"):
            line = raw_line.rstrip("\r")
            if not line.strip():
                # Blank line closes the current record.
                if record:
                    records.append(record)
                record = {}
                field = None
            elif line.startswith("      "):
                # Continuation of the previous field (skipped if that field is not kept).
                if field is not None:
                    record[field] += " " + line.strip()
            elif line[4:5] == "-":
                spec = _MEDLINE_FIELDS.get(line[:4].strip())
                if spec is None:
                    field = None
                else:
                    field, prefix = spec
                    record[field] = prefix + line[5:].strip()
            else:
                field = None
        # Flush the final record when the text does not end with a blank line.
        if record:
            records.append(record)
        return records
Please use command line to load one.") # Sidebar with operation selection -option = st.sidebar.selectbox("Choose operation", (CHAT, SEARCH, CREATE, INSERT, CART, ABOUT, HELP, )) +option = st.sidebar.selectbox("Choose operation", (CHAT, SEARCH, CREATE, INSERT, CART, ABOUT, HELP, EXAMPLES)) collection = st.sidebar.selectbox( "Choose collection", - list(db.list_collection_names()), + list(db.list_collection_names()) + [PUBMED], help=""" A collection is a knowledge base. It could be anything, but it's likely your instance has some bio-ontologies pre-loaded. @@ -62,7 +68,7 @@ background_collection = st.sidebar.selectbox( "Background knowledge", - [NO_BACKGROUND_SELECTED] + list(db.list_collection_names()), + [NO_BACKGROUND_SELECTED, PUBMED] + list(db.list_collection_names()), help=""" Background databases can be used to give additional context to the LLM. A standard pattern is to have a structured knowledge base as the main @@ -74,6 +80,17 @@ """ ) +st.sidebar.markdown("Developed by the Monarch Initiative") + + +def ask_chatbot(query) -> ChatResponse: + if collection == PUBMED: + chatbot = PubmedAgent(local_store=db, extractor=extractor) + return chatbot.chat(query) + else: + chatbot = ChatEngine(kb_adapter=db, extractor=extractor) + return chatbot.chat(query, collection=collection) + def html_table(rows: List[dict]) -> str: hdr = rows[0].keys() @@ -92,8 +109,6 @@ def html_table(rows: List[dict]) -> str: return html_content - -# Insert operation if option == INSERT: st.subheader(f"Insert new document in {collection}") objs = list(db.peek(collection=collection)) @@ -166,6 +181,9 @@ def html_table(rows: List[dict]) -> str: for the model you selected. """) + examples = get_applicable_examples(collection, CREATE) + st.write("Examples:") + st.write(f"
{html_table(examples)}
", unsafe_allow_html=True) extractor.model_name = model_name # Check for session state variables @@ -175,13 +193,20 @@ def html_table(rows: List[dict]) -> str: if st.button(CREATE): if not property_query: property_query = "label" + dalek = DatabaseAugmentedExtractor(kb_adapter=db, extractor=extractor) if background_collection != NO_BACKGROUND_SELECTED: - agent.document_adapter = db - agent.document_adapter_collection = background_collection + if background_collection == PUBMED: + dalek.document_adapter = PubmedAgent(local_store=db, extractor=extractor) + dalek.collection = None + else: + dalek.document_adapter = db + dalek.document_adapter_collection = background_collection st.write(f"Generating using: **{extractor.model_name}** using *{collection}* for examples") + if background_collection: + st.write(f"Using background knowledge from: *{background_collection}*") rules = [instructions] if instructions else None st.session_state.results = [ - agent.generate_extract( + dalek.generate_extract( search_query, #target_class="OntologyClass", context_property=property_query, @@ -225,7 +250,7 @@ def html_table(rows: List[dict]) -> str: if not property_query: property_query = "label" st.session_state.results = [ - agent.generate_extract( + dalek.generate_extract( search_query, target_class="OntologyClass", context_property=property_query, @@ -263,9 +288,12 @@ def html_table(rows: List[dict]) -> str: complete results, but may also exceed context windows for the model. """) extractor.model_name = model_name + examples = get_applicable_examples(collection, CHAT) + st.write("Examples:") + st.write(f"
{html_table(examples)}
", unsafe_allow_html=True) if st.button(CHAT): - response = chatbot.chat(query, collection=collection) + response = ask_chatbot(query) st.markdown(response.formatted_response) for ref, text in response.references.items(): st.subheader(f"Reference {ref}", anchor=f"ref-{ref}") @@ -274,6 +302,11 @@ def html_table(rows: List[dict]) -> str: elif option == CART: st.subheader("Coming soon!") +elif option == EXAMPLES: + cc = get_case_collection() + st.subheader("Examples") + st.code(yaml.dump(cc, sort_keys=False), language="yaml") + elif option == ABOUT: st.subheader("About this instance") @@ -289,17 +322,26 @@ def html_table(rows: List[dict]) -> str: st.subheader("About") st.write("CurateGPT is a tool for generating new entries for a knowledge base, assisted by LLMs.") st.write("It is a highly generic system, but it's likely the instance you are using now is configured to work with ontologies.") + st.subheader("Issues") + st.write("If you have any issues, please raise them on the [GitHub issue tracker](https://github.com/monarch-initiative/curate-gpt).") st.subheader("Warning!") st.caption("CurateGPT is pre-alpha, documentation is incomplete!") st.caption("If you are using a publicly deployed instance, some operations may be slow, or broken") st.subheader("Instructions") st.write("Use the sidebar to select the operation you want to perform.") st.write(" * Synthesize: the core operation. 
Generate a new entry for the selected collection.") + st.write(" * Chat: chat to a structured knowledge base or unstructured source.") st.write(" * Search: Search the vector stores.") st.write(" * Insert: Manually add data (do this responsibly if on public instance - no auth yet!.") st.write(" * About: View metadata for each instance.") st.subheader("FAQ") - st.write("### Why are there no IDs") + st.write("### Why are there no IDs?") st.write("LLMs will hallucinate IDs so we transform to CamelCase labels for demo purposes.") st.write("In future versions we will have a more elegant solution.") + st.write("### What is the PubMed collection?") + st.write("This is a special *virtual* collections. It is not populated ahead of time.") + st.write("When this is used as a source, the pubmed API is called with a relevancy search.") + st.write("These results are then combined with others to answer the query.") + st.write("### What is the 'background' collection?") + st.write(f"This is used only by '{CREATE}' to provide additional context.") diff --git a/src/curate_gpt/app/cases.yaml b/src/curate_gpt/app/cases.yaml new file mode 100644 index 0000000..bc159ca --- /dev/null +++ b/src/curate_gpt/app/cases.yaml @@ -0,0 +1,35 @@ +cases: +- source: cl + mode: CHAT + input: "What neurotransmitter is released by the hippocampus?" + domains: + - neuroscience + - cell types + answers: + - matches: glutamate + reference: GlutamateSecretion +- source: cl + mode: CHAT + input: "What cells synthesize catecholamine?" +- source: pubmed + mode: CHAT + input: "what neurons express VIP?" + domains: + - neuroscience + - cell types +- source: pubmed + mode: CHAT + input: "what is the role of the hippocampus in memory?" 
from pathlib import Path
from typing import Dict, List, Optional

HELP_CASES = Path(__file__).parent / "cases.yaml"


def get_case_collection():
    """
    Load the example cases file.

    :return: the parsed content of ``cases.yaml`` (read from HELP_CASES)
    """
    # Imported here so this module can be loaded without PyYAML until the
    # file is actually read.
    import yaml

    with open(HELP_CASES, encoding="utf-8") as stream:
        return yaml.safe_load(stream)


def get_applicable_examples(
    collection: Optional[str],
    mode: str,
    relax=True,
    cases: Optional[List[Dict]] = None,
) -> List[Dict]:
    """
    Get applicable examples for a given collection and mode.

    A case applies when its ``mode`` equals ``mode`` (upper-cased) and its
    ``source`` occurs, ignoring case, inside the collection name. The
    ``domains`` and ``answers`` keys are dropped from the returned copies.

    :param collection: collection name to match sources against; falsy means any
    :param mode: mode to match (compared upper-cased); falsy means any
    :param relax: if true and nothing matches the collection, retry once
        ignoring the collection
    :param cases: cases to filter; defaults to the ``cases`` list from
        :func:`get_case_collection`
    :return: list of matching cases
    """
    if mode:
        mode = mode.upper()
    if cases is None:
        cases = get_case_collection()["cases"]
    # Case-insensitive so that a source like "pubmed" matches a collection
    # label such as "PubMed (via API)".
    needle = collection.casefold() if collection else None
    hidden_keys = ("domains", "answers")
    examples = []
    for case in cases:
        if mode and case.get("mode") != mode:
            continue
        if needle is not None:
            source = case.get("source")
            # TODO: less hacky check
            if not source or source.casefold() not in needle:
                continue
        examples.append({k: v for k, v in case.items() if k not in hidden_keys})
    if not examples and relax and collection:
        # If no examples are found, try to relax the collection
        return get_applicable_examples(None, mode, relax=False, cases=cases)
    return examples
"--append/--no-append", default=False, show_default=True, help="Append to the database." ) +object_type_option = click.option( + "--object-type", + default="Thing", + show_default=True, + help="Type of object in index.", +) +description_option = click.option( + "--description", + help="Description of the collection.", +) + @click.group(cls=DefaultGroup, @@ -82,8 +94,12 @@ def main(verbose: int, quiet: bool): @collection_option @model_option @click.option("--text-field") +@object_type_option +@description_option +@click.option("--batch-size", default=None, show_default=True, type=click.INT, + help="Batch size for indexing.") @click.argument("files", nargs=-1) -def index(files, path, reset: bool, text_field, collection, model, **kwargs): +def index(files, path, reset: bool, text_field, collection, model, object_type, description, batch_size, **kwargs): """Index files. Example: @@ -93,8 +109,6 @@ def index(files, path, reset: bool, text_field, collection, model, **kwargs): """ db = ChromaDBAdapter(path, **kwargs) db.text_lookup = text_field - if model: - db.model = model if reset: db.reset() for file in files: @@ -106,7 +120,8 @@ def index(files, path, reset: bool, text_field, collection, model, **kwargs): objs = yaml.safe_load(open(file)) if not isinstance(objs, list): objs = [objs] - db.insert(objs, collection=collection) + db.insert(objs, model=model, collection=collection, batch_size=batch_size) + db.update_collection_metadata(collection, model=model, object_type=object_type, description=description) @main.command(name="search") @@ -359,6 +374,62 @@ def index_ontology_command(ont, path, reset: bool, collection, append, model, in db.update_collection_metadata(collection, object_type="OntologyClass") +@main.group() +def pubmed(): + "Use pubmed" + + +@pubmed.command(name="search") +@collection_option +@path_option +@model_option +@click.option("--expand/--no-expand", default=True, show_default=True, + help="Whether to expand the search term using an LLM.") 
+@click.argument("query") +def pubmed_search(query, path, model, **kwargs): + pubmed = PubmedAgent() + db = ChromaDBAdapter(path) + extractor = BasicExtractor() + if model: + extractor.model_name = model + pubmed.extractor = extractor + pubmed.local_store = db + results = pubmed.search(query, **kwargs) + i = 0 + for obj, distance, _ in results: + i += 1 + print(f"## {i} DISTANCE: {distance}") + print(yaml.dump(obj, sort_keys=False)) + + +@pubmed.command(name="ask") +@collection_option +@path_option +@model_option +@limit_option +@click.option("--show-references/--no-show-references", default=True, + show_default=True, + help="Whether to show references.") +@click.option("--expand/--no-expand", default=True, show_default=True, + help="Whether to expand the search term using an LLM.") +@click.argument("query") +def pubmed_ask(query, path, model, show_references, **kwargs): + pubmed = PubmedAgent() + db = ChromaDBAdapter(path) + extractor = BasicExtractor() + if model: + extractor.model_name = model + pubmed.extractor = extractor + pubmed.local_store = db + response = pubmed.chat(query, **kwargs) + click.echo(response.formatted_response) + if show_references: + print("# References:") + for ref, ref_text in response.references.items(): + print(f"## {ref}") + print(ref_text) + + if __name__ == "__main__": main() diff --git a/src/curate_gpt/store/chromadb_adapter.py b/src/curate_gpt/store/chromadb_adapter.py index dc1b899..f9361a0 100644 --- a/src/curate_gpt/store/chromadb_adapter.py +++ b/src/curate_gpt/store/chromadb_adapter.py @@ -165,7 +165,7 @@ def _insert_or_update( objs = list(objs) else: objs = [objs] - if self._is_openai(collection_obj): + if self._is_openai(collection_obj) and batch_size is None: batch_size = 100 if text_field is None: text_field = self.text_lookup @@ -205,6 +205,16 @@ def update(self, objs: Union[OBJECT, List[OBJECT]], **kwargs): """ self._insert_or_update(objs, method_name="update", **kwargs) + def upsert(self, objs: Union[OBJECT, 
List[OBJECT]], **kwargs): + """ + Update an object or list of objects in the store. + + :param objs: + :param collection: + :return: + """ + self._insert_or_update(objs, method_name="upsert", **kwargs) + def remove_collection(self, collection: str = DEFAULT_COLLECTION, exists_ok=False, **kwargs): """ Remove a collection from the database. diff --git a/src/curate_gpt/store/db_adapter.py b/src/curate_gpt/store/db_adapter.py index aada6d3..d95a963 100644 --- a/src/curate_gpt/store/db_adapter.py +++ b/src/curate_gpt/store/db_adapter.py @@ -116,7 +116,7 @@ def create_view(self, view_name: str, collection: str, expression: QUERY, **kwar """ raise NotImplementedError - def remove_collection(self, collection: str = DEFAULT_COLLECTION, **kwargs): + def remove_collection(self, collection: str = DEFAULT_COLLECTION, exists_ok=False, **kwargs): """ Remove a collection from the database. @@ -152,6 +152,16 @@ def set_collection_metadata(self, collection_name: Optional[str], metadata: Coll """ raise NotImplementedError + def update_collection_metadata(self, collection_name: str, **kwargs) -> CollectionMetadata: + """ + Update the metadata for a collection. + + :param collection_name: + :param kwargs: + :return: + """ + raise NotImplementedError + @abstractmethod def search( self, text: str, where: QUERY = None, collection: str = DEFAULT_COLLECTION, **kwargs