Skip to content

Commit

Permalink
add ingest chunks workflows (#1329)
Browse files Browse the repository at this point in the history
* add ingest chunks workflows

* finalize ingest chunks

* fix
  • Loading branch information
emrgnt-cmplxty authored Oct 3, 2024
1 parent fc8a5a3 commit e6d1750
Show file tree
Hide file tree
Showing 23 changed files with 584 additions and 81 deletions.
4 changes: 4 additions & 0 deletions docs/api-reference/endpoint/ingest_chunks.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
title: 'Ingest Chunks'
openapi: 'POST /v2/ingest_chunks'
---
1 change: 1 addition & 0 deletions docs/mint.json
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@
"group": "Document Ingestion",
"pages": [
"api-reference/endpoint/ingest_files",
"api-reference/endpoint/ingest_chunks",
"api-reference/endpoint/update_files"
]
},
Expand Down
8 changes: 4 additions & 4 deletions py/cli/commands/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,13 @@ def update_files(ctx, file_paths, document_ids, metadatas):


@cli.command()
@click.option("--v2", is_flag=True, help="use aristotle_v2.txt (a smaller file)")
@click.option(
"--v2", is_flag=True, help="use aristotle_v2.txt (a smaller file)"
)
@pass_context
def ingest_sample_file(ctx, v2=False):
"""Ingest the first sample file into R2R."""
sample_file_url = (
f"https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/aristotle{'_v2' if v2 else ''}.txt"
)
sample_file_url = f"https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/aristotle{'_v2' if v2 else ''}.txt"
client = ctx.obj

with timer():
Expand Down
6 changes: 4 additions & 2 deletions py/core/base/abstractions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
DocumentInfo,
DocumentType,
IngestionStatus,
KGExtractionStatus,
KGEnrichmentStatus,
KGExtractionStatus,
RawChunk,
)
from shared.abstractions.embedding import (
EmbeddingPurpose,
Expand Down Expand Up @@ -77,10 +78,11 @@
"Document",
"DocumentExtraction",
"DocumentInfo",
"DocumentType",
"IngestionStatus",
"KGExtractionStatus",
"KGEnrichmentStatus",
"DocumentType",
"RawChunk",
# Embedding abstractions
"EmbeddingPurpose",
"default_embedding_prefixes",
Expand Down
2 changes: 1 addition & 1 deletion py/core/main/api/auth_router.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from uuid import UUID
from typing import Optional
from uuid import UUID

from fastapi import Body, Depends, Path
from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
Expand Down
48 changes: 48 additions & 0 deletions py/core/main/api/data/ingestion_router_openapi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,51 @@ update_files:
document_ids: "An optional list of document ids for each file. If not provided, the system will attempt to generate the corresponding unique from the `generate_document_id` method."
metadatas: "An optional list of JSON metadata to affix to each file"
ingestion_config: "JSON string for chunking configuration override"

ingest_chunks:
openapi_extra:
x-codeSamples:
- lang: Python
source: |
from r2r import R2RClient
client = R2RClient("http://localhost:7272")
# when using auth, do client.login(...)
result = client.ingest_chunks(
chunks=[
{
"text": "Another chunk of text",
},
{
"text": "Yet another chunk of text",
},
{
"text": "A chunk of text",
},
], )
- lang: Shell
source: |
curl -X POST "https://api.example.com/ingest_chunks" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer YOUR_API_KEY" \
-d '{
"chunks": [
{
"text": "Another chunk of text"
},
{
"text": "Yet another chunk of text"
},
{
"text": "A chunk of text"
}
],
"document_id": "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
"metadata": {}
}'
input_descriptions:
chunks: "A list of text chunks to ingest into the system."
document_id: "An optional document id to associate the chunks with. If not provided, a unique document id will be generated."
metadata: "Optional JSON metadata to associate with the ingested chunks."
68 changes: 64 additions & 4 deletions py/core/main/api/ingestion_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
from uuid import UUID

import yaml
from fastapi import Depends, File, Form, UploadFile
from fastapi import Body, Depends, File, Form, UploadFile
from pydantic import Json

from core.base import R2RException, generate_document_id
from core.base import R2RException, RawChunk, generate_document_id
from core.base.api.models import (
WrappedIngestionResponse,
WrappedUpdateResponse,
Expand Down Expand Up @@ -38,12 +38,17 @@ def _register_workflows(self):
self.service,
{
"ingest-files": (
"Ingestion task queued successfully."
"Ingest files task queued successfully."
if self.orchestration_provider.config.provider != "simple"
else "Ingestion task completed successfully."
),
"ingest-chunks": (
"Ingest chunks task queued successfully."
if self.orchestration_provider.config.provider != "simple"
else "Ingestion task completed successfully."
),
"update-files": (
"Update task queued successfully."
"Update file task queued successfully."
if self.orchestration_provider.config.provider != "simple"
else "Update task queued successfully."
),
Expand Down Expand Up @@ -96,6 +101,7 @@ async def ingest_files_app(
A valid user authentication token is required to access this endpoint, as regular users can only ingest files for their own access. More expansive collection permissioning is under development.
"""
self._validate_ingestion_config(ingestion_config)

# Check if the user is a superuser
if not auth_user.is_superuser:
Expand Down Expand Up @@ -253,6 +259,60 @@ async def update_files_app(
raw_message["document_ids"] = workflow_input["document_ids"]
return raw_message

ingest_chunks_extras = self.openapi_extras.get("ingest_chunks", {})
ingest_chunks_descriptions = ingest_chunks_extras.get(
"input_descriptions", {}
)

@self.router.post(
"/ingest_chunks",
openapi_extra=ingest_chunks_extras.get("openapi_extra"),
)
@self.base_endpoint
async def ingest_chunks_app(
chunks: Json[list[RawChunk]] = Body(
{}, description=ingest_chunks_descriptions.get("chunks")
),
document_id: Optional[UUID] = Body(
None, description=ingest_chunks_descriptions.get("document_id")
),
metadata: Optional[Json[dict]] = Body(
None, description=ingest_files_descriptions.get("metadata")
),
auth_user=Depends(self.service.providers.auth.auth_wrapper),
response_model=WrappedIngestionResponse,
):
"""
Ingest text chunks into the system.
This endpoint supports multipart/form-data requests, enabling you to ingest pre-parsed text chunks into R2R.
A valid user authentication token is required to access this endpoint, as regular users can only ingest chunks for their own access. More expansive collection permissioning is under development.
"""
if not document_id:
document_id = generate_document_id(
chunks[0].text[:20], auth_user.id
)

workflow_input = {
"document_id": str(document_id),
"chunks": [chunk.model_dump() for chunk in chunks],
"metadata": metadata or {},
"user": auth_user.model_dump_json(),
}

raw_message = await self.orchestration_provider.run_workflow(
"ingest-chunks",
{"request": workflow_input},
options={
"additional_metadata": {
"document_id": str(document_id),
}
},
)
raw_message["document_id"] = str(document_id)
return raw_message

@staticmethod
def _validate_ingestion_config(ingestion_config):
from ..assembly.factory import R2RProviderFactory
Expand Down
4 changes: 3 additions & 1 deletion py/core/main/api/kg_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,9 @@ async def enrich_graph(

if run_type is KGRunType.ESTIMATE:

return await self.service.get_enrichment_estimate(collection_id, server_kg_enrichment_settings)
return await self.service.get_enrichment_estimate(
collection_id, server_kg_enrichment_settings
)

if kg_enrichment_settings:
for key, value in kg_enrichment_settings.items():
Expand Down
Loading

0 comments on commit e6d1750

Please sign in to comment.