
Feature/revive integration tests #1343

Merged 30 commits on Oct 5, 2024
Changes from 24 commits
171 changes: 70 additions & 101 deletions .github/workflows/integration-test-workflow-debian.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
name: R2R CLI Integration Test (Debian GNU/Linux 12 (bookworm) amd64)
name: R2R CLI Integration and Regression Test

on:
push:
branches:
- '**'
workflow_dispatch:
- '**' # Trigger on all branches
workflow_dispatch: # Allow manual trigger

jobs:
build-and-test:
@@ -16,29 +16,57 @@ jobs:
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
TELEMETRY_ENABLED: false
POSTGRES_USER: ${{ secrets.POSTGRES_USER }}
POSTGRES_PASSWORD: ${{ secrets.POSTGRES_PASSWORD }}
POSTGRES_DBNAME: ${{ secrets.POSTGRES_DBNAME }}
POSTGRES_HOST: ${{ secrets.POSTGRES_HOST }}
POSTGRES_PORT: ${{ secrets.POSTGRES_PORT }}
R2R_PROJECT_NAME: ${{ secrets.R2R_PROJECT_NAME }}
POSTGRES_HOST: localhost
POSTGRES_DBNAME: postgres
POSTGRES_PORT: 5432
POSTGRES_PASSWORD: postgres
POSTGRES_USER: postgres
R2R_PROJECT_NAME: r2r_default

steps:
- uses: actions/checkout@v4
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
- name: Set up Python environment
uses: actions/setup-python@v4
with:
python-version: '3.x'
python-version: '3.10' # Use a stable Python version

- name: Install Poetry
- name: Install Poetry and dependencies
run: |
curl -sSL https://install.python-poetry.org | python3 -
cd py && poetry install -E core -E ingestion-bundle

- name: Install dependencies
working-directory: ./py
- name: Remove pre-installed PostgreSQL
run: |
Review comment (Contributor): The `sudo apt-get purge -y 'postgresql-*'` and subsequent `rm -rf` commands are repeated here and after the tests. Consider removing the pre-test purge to avoid redundancy.
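A minimal sketch of the deduplication the reviewer suggests (hypothetical, not part of the PR): define the purge commands once and invoke them only in the post-test cleanup step.

```shell
# Hypothetical helper consolidating the duplicated PostgreSQL cleanup.
# Assumes a Debian runner with passwordless sudo, as in the workflow above.
cleanup_postgres() {
    sudo apt-get purge -y 'postgresql-*'
    sudo rm -rf /var/lib/postgresql /var/log/postgresql /etc/postgresql
}

# Call only from the `if: always()` step that runs after the tests:
# cleanup_postgres
```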

sudo apt-get purge -y 'postgresql-*'
sudo rm -rf /var/lib/postgresql
sudo rm -rf /var/log/postgresql
sudo rm -rf /etc/postgresql

- name: Add PostgreSQL Apt Repository
run: |
# Add the PostgreSQL Apt repository
echo "deb [signed-by=/usr/share/keyrings/postgresql-archive-keyring.gpg] http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list
# Download and add the repository GPG key
wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo gpg --dearmor -o /usr/share/keyrings/postgresql-archive-keyring.gpg

- name: Install PostgreSQL 15 and pgvector
run: |
sudo apt-get update
sudo apt-get install -y postgresql-15 postgresql-client-15 postgresql-15-pgvector

- name: Start PostgreSQL 15 service
run: |
sudo systemctl enable postgresql@15-main
sudo systemctl start postgresql@15-main

- name: Configure PostgreSQL
run: |
poetry install -E core -E ingestion-bundle
# Change to a directory accessible by the postgres user to avoid permission warnings
cd /
sudo -u postgres /usr/lib/postgresql/15/bin/psql -c "ALTER USER postgres PASSWORD 'postgres';"
sudo -u postgres /usr/lib/postgresql/15/bin/psql -c "CREATE EXTENSION vector;"

- name: Start R2R server
working-directory: ./py
@@ -47,102 +75,43 @@ jobs:
echo "Waiting for services to start..."
sleep 30

- name: Run integration tests
- name: Run CLI Ingestion
working-directory: ./py
run: |
echo "R2R Version"
poetry run r2r version
poetry run python tests/integration/harness_cli.py test_ingest_sample_file_cli
poetry run python tests/integration/harness_cli.py test_document_overview_sample_file_cli
poetry run python tests/integration/harness_cli.py test_document_chunks_sample_file_cli

- name: Walkthrough
- name: Run CLI Retrieval
working-directory: ./py
run: |
echo "Ingest Data"
poetry run r2r ingest-sample-files

echo "Get Documents Overview"
poetry run r2r documents-overview

echo "Get Document Chunks"
poetry run r2r document-chunks --document-id=9fbe403b-c11c-5aae-8ade-ef22980c3ad1

echo "Delete Documents"
poetry run r2r delete --filter=document_id:eq:9fbe403b-c11c-5aae-8ade-ef22980c3ad1

echo "Update Document"
poetry run r2r update-files core/examples/data/aristotle_v2.txt --document-ids=9fbe403b-c11c-5aae-8ade-ef22980c3ad1

echo "Vector Search"
poetry run r2r search --query="What was Uber's profit in 2020?"

echo "Hybrid Search"
r2r search --query="What was Uber's profit in 2020?" --use-hybrid-search

echo "Basic RAG"
poetry run r2r rag --query="What was Uber's profit in 2020?"

echo "RAG with Hybrid Search"
poetry run r2r rag --query="Who is Jon Snow?" --use-hybrid-search

echo "Streaming RAG"
poetry run r2r rag --query="who was aristotle" --use-hybrid-search --stream
poetry run python tests/integration/harness_cli.py test_vector_search_sample_file_filter_cli
poetry run python tests/integration/harness_cli.py test_rag_response_sample_file_cli
poetry run python tests/integration/harness_cli.py test_rag_response_stream_sample_file_cli

echo "User Registration"
curl -X POST http://localhost:7272/v2/register \
-H "Content-Type: application/json" \
-d '{
"email": "[email protected]",
"password": "password123"
}'

echo "User Login"
curl -X POST http://localhost:7272/v2/login \
-H "Content-Type: application/x-www-form-urlencoded" \
-d "[email protected]&password=password123"

echo "Users Overview"
poetry run r2r users-overview

echo "Logging"
poetry run r2r logs

echo "Analytics"
poetry run r2r analytics --filters '{"search_latencies": "search_latency"}' --analysis-types '{"search_latencies": ["basic_statistics", "search_latency"]}'

- name: GraphRAG
- name: Run SDK Integration
working-directory: ./py
run: |
echo "Create Knowledge Graph"
poetry run r2r create-graph --document-ids=9fbe403b-c11c-5aae-8ade-ef22980c3ad1

echo "Inspect Knowledge Graph"
poetry run r2r inspect-knowledge-graph

echo "Graph Enrichment"
poetry run r2r enrich-graph

echo "Local Search"
r2r search --query="Who is Aristotle?" --use-kg-search --kg-search-type=local

echo "Global Search"
r2r search --query="What were Aristotles key contributions to philosophy?" --use-kg-search --kg-search-type=global --max-llm-queries-for-global-search=100
poetry run python tests/integration/harness_sdk.py test_ingest_sample_file_sdk
poetry run python tests/integration/harness_sdk.py test_reingest_sample_file_sdk
poetry run python tests/integration/harness_sdk.py test_document_overview_sample_file_sdk

echo "RAG"
r2r rag --query="What are the key contributions of Aristotle to modern society?" --use-kg-search --kg-search-type=global --max-llm-queries-for-global-search=100






- name: Advanced RAG
- name: Run SDK Retrieval
working-directory: ./py
run: |
echo "HyDE"
poetry run r2r rag --query="who was aristotle" --use-hybrid-search --stream --search-strategy=hyde

echo "Rag-Fusion"
r2r rag --query="Explain the theory of relativity" --use-hybrid-search --stream --search-strategy=rag_fusion
poetry run python tests/integration/harness_sdk.py test_vector_search_sample_file_filter_sdk
poetry run python tests/integration/harness_sdk.py test_hybrid_search_sample_file_filter_sdk
poetry run python tests/integration/harness_sdk.py test_rag_response_sample_file_sdk
poetry run python tests/integration/harness_sdk.py test_rag_response_stream_sample_file_sdk

- name: Stop R2R server
if: always()
run: ps aux | grep "r2r serve" | awk '{print $2}' | xargs kill || true

Review comment (Contributor): Using `ps aux | grep "r2r serve" | awk '{print $2}' | xargs kill || true` can be unsafe as it might kill unintended processes. Consider storing the process ID when starting the server and using it to stop the server.
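The reviewer's PID-based alternative could look like this (a hedged sketch; `sleep 300` stands in for `poetry run r2r serve`, and the pid-file path is an assumption):

```shell
# Record the PID of the background server process at startup...
sleep 300 &                      # stand-in for: poetry run r2r serve &
echo $! > /tmp/r2r_server.pid    # $! expands to the PID of the last background job

# ...and in the cleanup step, signal only that recorded PID.
kill "$(cat /tmp/r2r_server.pid)" || true
```

Unlike the `ps | grep | kill` pipeline, this cannot match unrelated processes whose command line happens to contain "r2r serve".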

- name: Uninstall PostgreSQL after tests (Optional)
if: always()
run: |
pkill -f "r2r serve"
sudo apt-get purge -y 'postgresql-*'
sudo rm -rf /var/lib/postgresql
sudo rm -rf /var/log/postgresql
sudo rm -rf /etc/postgresql
2 changes: 1 addition & 1 deletion docs/api-reference/openapi.json

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions docs/cookbooks/walkthrough.mdx
@@ -81,11 +81,11 @@ concurrent_request_limit = 16
model = "openai/gpt-4o"
temperature = 0.5

[chunking]
[ingestion]
provider = "r2r"
chunking_strategy = "recursive"
chunk_size = 512
chunk_overlap = 256
chunk_size = 1_024
chunk_overlap = 512
excluded_parsers = ["mp4"]
```

1 change: 1 addition & 0 deletions py/core/__init__.py
@@ -130,6 +130,7 @@
"PipeType",
## PROVIDERS
# Base provider classes
"AppConfig",
"Provider",
"ProviderConfig",
# Auth provider
1 change: 1 addition & 0 deletions py/core/base/__init__.py
@@ -103,6 +103,7 @@
"PipeType",
## PROVIDERS
# Base provider classes
"AppConfig",
"Provider",
"ProviderConfig",
# Auth provider
2 changes: 1 addition & 1 deletion py/core/base/api/models/__init__.py
@@ -53,10 +53,10 @@
RAGAgentResponse,
RAGResponse,
SearchResponse,
WrappedCompletionResponse,
WrappedRAGAgentResponse,
WrappedRAGResponse,
WrappedSearchResponse,
WrappedCompletionResponse,
)

__all__ = [
4 changes: 2 additions & 2 deletions py/core/base/providers/kg.py
@@ -89,7 +89,7 @@ async def get_entities(
limit: int,
entity_ids: list[str] | None = None,
with_description: bool = False,
) -> list[Entity]:
) -> dict:
"""Abstract method to get entities."""
pass

@@ -100,7 +100,7 @@ async def get_triples(
offset: int,
limit: int,
triple_ids: list[str] | None = None,
) -> list[Triple]:
) -> dict:
"""Abstract method to get triples."""
pass

2 changes: 1 addition & 1 deletion py/core/configs/r2r_aws_bedrock.toml
@@ -7,7 +7,7 @@ require_email_verification = false
default_admin_email = "[email protected]"
default_admin_password = "change_me_immediately"

[chunking]
[ingestion]
provider = "unstructured_local"
strategy = "auto"
chunking_strategy = "by_title"
24 changes: 11 additions & 13 deletions py/core/main/api/ingestion_router.py
@@ -2,7 +2,7 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Optional
from typing import Optional, Union
from uuid import UUID

import yaml
@@ -92,8 +92,7 @@ async def ingest_files_app(
description=ingest_files_descriptions.get("ingestion_config"),
),
auth_user=Depends(self.service.providers.auth.auth_wrapper),
response_model=WrappedIngestionResponse,
):
) -> WrappedIngestionResponse: # type: ignore
"""
Ingest files into the system.

@@ -119,7 +118,7 @@

file_datas = await self._process_files(files)

messages = []
messages: list[dict[str, Union[str, None]]] = []
for it, file_data in enumerate(file_datas):
content_length = len(file_data["content"])
file_content = BytesIO(base64.b64decode(file_data["content"]))
@@ -150,7 +149,7 @@
file_content,
file_data["content_type"],
)
raw_message = await self.orchestration_provider.run_workflow(
raw_message: dict[str, Union[str, None]] = await self.orchestration_provider.run_workflow( # type: ignore
"ingest-files",
{"request": workflow_input},
options={
@@ -160,9 +159,10 @@
},
)
raw_message["document_id"] = str(document_id)
if "task_id" not in raw_message:
raw_message["task_id"] = None
messages.append(raw_message)

return messages
return messages # type: ignore

update_files_extras = self.openapi_extras.get("update_files", {})
update_files_descriptions = update_files_extras.get(
@@ -189,8 +189,7 @@ async def update_files_app(
description=ingest_files_descriptions.get("ingestion_config"),
),
auth_user=Depends(self.service.providers.auth.auth_wrapper),
response_model=WrappedUpdateResponse,
):
) -> WrappedUpdateResponse:
"""
Update existing files in the system.

@@ -257,7 +256,7 @@ async def update_files_app(
)
raw_message["message"] = "Update task queued successfully."
raw_message["document_ids"] = workflow_input["document_ids"]
return raw_message
return raw_message # type: ignore

ingest_chunks_extras = self.openapi_extras.get("ingest_chunks", {})
ingest_chunks_descriptions = ingest_chunks_extras.get(
@@ -280,8 +279,7 @@ async def ingest_chunks_app(
None, description=ingest_files_descriptions.get("metadata")
),
auth_user=Depends(self.service.providers.auth.auth_wrapper),
response_model=WrappedIngestionResponse,
):
) -> WrappedIngestionResponse:
"""
Ingest text chunks into the system.

@@ -311,7 +309,7 @@
},
)
raw_message["document_id"] = str(document_id)
return raw_message
return raw_message # type: ignore

@staticmethod
def _validate_ingestion_config(ingestion_config):
2 changes: 1 addition & 1 deletion py/core/main/api/kg_router.py
@@ -72,7 +72,7 @@ async def create_graph(
description="Settings for the graph creation process.",
),
auth_user=Depends(self.service.providers.auth.auth_wrapper),
) -> WrappedKGCreationResponse:
) -> WrappedKGCreationResponse: # type: ignore
"""
Create a graph on your documents. This endpoint takes as input a list of document IDs and KGCreationSettings. If document IDs are not provided, the graph will be created on all documents in the system.
This step extracts the relevant entities and relationships from the documents and creates a graph based on the extracted information.