diff --git a/.github/workflows/build-docker.yml b/.github/workflows/build-r2r-docker.yml similarity index 99% rename from .github/workflows/build-docker.yml rename to .github/workflows/build-r2r-docker.yml index f41af2ca2..398245ee5 100644 --- a/.github/workflows/build-docker.yml +++ b/.github/workflows/build-r2r-docker.yml @@ -1,4 +1,4 @@ -name: Build and Publish Docker Image +name: Build and Publish R2R Docker Image on: workflow_dispatch: diff --git a/.github/workflows/build-unst-docker.yml b/.github/workflows/build-unst-docker.yml new file mode 100644 index 000000000..0da85052e --- /dev/null +++ b/.github/workflows/build-unst-docker.yml @@ -0,0 +1,54 @@ +name: Build and Publish Unstructured Docker Image + +on: + workflow_dispatch: + +env: + REGISTRY_BASE: ragtoriches + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install toml package + run: pip install toml + + - name: Determine version + id: version + run: | + echo "REGISTRY_IMAGE=${{ env.REGISTRY_BASE }}/unst-prod" >> $GITHUB_OUTPUT + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Docker Auth + uses: docker/login-action@v3 + with: + username: ${{ secrets.RAGTORICHES_DOCKER_UNAME }} + password: ${{ secrets.RAGTORICHES_DOCKER_TOKEN }} + + - name: Build and push image + uses: docker/build-push-action@v5 + with: + context: ./py + file: ./py/Dockerfile.unstructured + platforms: linux/amd64,linux/arm64 + push: true + tags: | + ${{ steps.version.outputs.REGISTRY_IMAGE }}:${{ steps.version.outputs.RELEASE_VERSION }} + ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest + provenance: false + sbom: false + + - name: Verify manifest + run: | + docker buildx imagetools inspect ${{ steps.version.outputs.REGISTRY_IMAGE }}:${{ steps.version.outputs.RELEASE_VERSION }} + docker buildx imagetools inspect ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest diff --git a/py/Dockerfile b/py/Dockerfile index 43233ec0b..0ee1c5102 100644 --- a/py/Dockerfile +++ b/py/Dockerfile @@ -1,11 +1,8 @@ FROM python:3.10-slim AS builder -# Install system dependencies (including those needed for Unstructured and OpenCV) +# Install system dependencies RUN apt-get update && apt-get install -y --no-install-recommends \ gcc g++ musl-dev curl libffi-dev gfortran libopenblas-dev \ - tesseract-ocr libtesseract-dev libleptonica-dev pkg-config \ - poppler-utils libmagic1 \ - libgl1-mesa-glx libglib2.0-0 \ && apt-get clean && rm -rf /var/lib/apt/lists/* WORKDIR /app @@ -14,21 +11,21 @@ RUN pip install --no-cache-dir poetry # Copy the entire project into the container COPY . /app + +# Ensure that the working directory is set to /app/py WORKDIR /app/py # Install dependencies RUN poetry config virtualenvs.create false \ && poetry install --extras "core" --no-dev --no-root \ - && pip install --no-cache-dir gunicorn uvicorn + && pip install --no-cache-dir gunicorn uvicorn -# Create the final image +# Final stage to keep the image small FROM python:3.10-slim # Install runtime dependencies RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - curl tesseract-ocr poppler-utils libmagic1 pandoc libreoffice \ - libgl1-mesa-glx libglib2.0-0 \ + && apt-get install -y --no-install-recommends curl \ && apt-get clean && rm -rf /var/lib/apt/lists/* WORKDIR /app @@ -37,18 +34,16 @@ WORKDIR /app COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages COPY --from=builder /usr/local/bin /usr/local/bin -# Copy the application and config +# Copy the necessary application files COPY core /app/core COPY r2r /app/r2r COPY r2r.toml /app/r2r.toml COPY pyproject.toml /app/pyproject.toml -# Expose the port ARG PORT=8000 ARG HOST=0.0.0.0 ENV PORT=$PORT HOST=$HOST EXPOSE $PORT -ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata # Run the application CMD ["sh", "-c", "uvicorn core.main.app_entry:app --host $HOST --port $PORT"] diff --git a/py/Dockerfile.dev b/py/Dockerfile.dev index 554fa6ca0..ee307bbc3 100644 --- a/py/Dockerfile.dev +++ b/py/Dockerfile.dev @@ -1,74 +1,59 @@ -# ======================= -# Builder Stage -# ======================= +# Stage 1: Builder FROM python:3.10-slim AS builder -# Install system dependencies (including those needed for Unstructured and OpenCV) +# Install system dependencies in a single RUN command to reduce layers RUN apt-get update && apt-get install -y --no-install-recommends \ gcc g++ musl-dev curl libffi-dev gfortran libopenblas-dev \ - tesseract-ocr libtesseract-dev libleptonica-dev pkg-config \ - poppler-utils libmagic1 \ - libgl1-mesa-glx libglib2.0-0 \ && apt-get clean && rm -rf /var/lib/apt/lists/* +# Set working directory +WORKDIR /app + # Install Poetry RUN pip install --no-cache-dir poetry -# Create application directory -RUN mkdir -p /app/py -WORKDIR /app/py +# Copy only dependency files first to leverage caching +COPY pyproject.toml poetry.lock /app/ -# Copy Poetry configuration -COPY pyproject.toml /app/py/pyproject.toml +# Set working directory to /app/py for dependency installation +WORKDIR /app -# Install dependencies, including gunicorn, uvicorn, and unstructured +# Install Python dependencies without creating a virtual environment RUN poetry config virtualenvs.create false \ && poetry install --extras "core" --no-dev --no-root \ && pip install --no-cache-dir gunicorn uvicorn -# ======================= -# Final Stage -# ======================= +# Copy only the necessary application files +COPY core /app/core +COPY r2r /app/r2r +COPY r2r.toml /app/r2r.toml + +# Stage 2: Final Image FROM python:3.10-slim -# Install runtime dependencies including Pandoc -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - curl \ - tesseract-ocr \ - poppler-utils \ - libmagic1 \ - libgl1-mesa-glx \ - libglib2.0-0 \ - pandoc \ - libreoffice \ +# Install runtime dependencies +RUN apt-get update && apt-get install -y --no-install-recommends curl \ && apt-get clean && rm -rf /var/lib/apt/lists/* # Set working directory WORKDIR /app -# Copy Python packages and binaries from builder +# Copy installed Python packages and binaries from the builder stage COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages COPY --from=builder /usr/local/bin /usr/local/bin -# Initialize Unstructured models -RUN python -c "from unstructured.partition.model_init import initialize; initialize()" +# Copy only the necessary application files from the builder +COPY --from=builder /app/core /app/core +COPY --from=builder /app/r2r /app/r2r +COPY --from=builder /app/r2r.toml /app/r2r.toml -# Expose the port and set environment variables -ARG PORT=8000 +# Set environment variables for port and host +ARG PORT=7272 ARG HOST=0.0.0.0 -ENV PORT=$PORT \ - HOST=$HOST \ - TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata - -EXPOSE $PORT +ENV PORT=${PORT} HOST=${HOST} -# Copy application code -COPY . /app -COPY core /app/core -COPY r2r /app/r2r -COPY r2r.toml /app/r2r.toml -COPY pyproject.toml /app/pyproject.toml +# Expose the specified port +EXPOSE ${PORT} -# Set the default command to run the application -CMD ["sh", "-c", "uvicorn core.main.app_entry:app --host $HOST --port $PORT"] +# Use the exec form of CMD for better signal handling +CMD ["uvicorn", "core.main.app_entry:app", "--host", "0.0.0.0", "--port", "7272"] diff --git a/py/Dockerfile.unstructured b/py/Dockerfile.unstructured new file mode 100644 index 000000000..a08c57747 --- /dev/null +++ b/py/Dockerfile.unstructured @@ -0,0 +1,29 @@ +FROM python:3.10-slim AS builder + +# Install system dependencies (including those needed for Unstructured and OpenCV) +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc g++ musl-dev curl libffi-dev gfortran libopenblas-dev \ + tesseract-ocr libtesseract-dev libleptonica-dev pkg-config \ + poppler-utils libmagic1 pandoc libreoffice \ + libgl1-mesa-glx libglib2.0-0 \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata + +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 + +WORKDIR /app + +RUN pip install --no-cache-dir unstructured "unstructured[all-docs]" + + +RUN python -c "from unstructured.partition.model_init import initialize; initialize()" + +RUN pip install gunicorn uvicorn fastapi httpx + +COPY core/integrations/unstructured/main.py . + +EXPOSE 7275 + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7275"] \ No newline at end of file diff --git a/py/cli/commands/server.py b/py/cli/commands/server.py index fae8d2519..8bba2311d 100644 --- a/py/cli/commands/server.py +++ b/py/cli/commands/server.py @@ -293,6 +293,8 @@ def image_exists(img): check=True, ) + subprocess.run(["docker", "build", "-t", "unstructured-docker", "-f", "Dockerfile.unstructured", "."], check=True) + if config_path: config_path = os.path.abspath(config_path) diff --git a/py/compose.yaml b/py/compose.yaml index 0df2af320..f24779783 100644 --- a/py/compose.yaml +++ b/py/compose.yaml @@ -20,8 +20,6 @@ networks: labels: - "com.docker.compose.recreate=always" - - services: setup-token: @@ -169,6 +167,8 @@ services: # Unstructured - UNSTRUCTURED_API_KEY=${UNSTRUCTURED_API_KEY:-} - UNSTRUCTURED_API_URL=${UNSTRUCTURED_API_URL:-https://api.unstructured.io/general/v0/general} + - UNSTRUCTURED_LOCAL_URL=${UNSTRUCTURED_LOCAL_URL:-http://unstructured:7275} + - UNSTRUCTURED_NUM_WORKERS=${UNSTRUCTURED_NUM_WORKERS:-10} # Hatchet - HATCHET_CLIENT_TLS_STRATEGY=none @@ -213,6 +213,8 @@ services: condition: service_healthy neo4j: condition: service_healthy + unstructured: + condition: service_healthy r2r-dashboard: image: emrgntcmplxty/r2r-dashboard:latest @@ -225,6 +227,7 @@ services: - "traefik.http.routers.r2r-dashboard.rule=PathPrefix(`/`)" - "traefik.http.services.r2r-dashboard.loadbalancer.server.port=3000" - "traefik.http.routers.r2r-dashboard.entrypoints=r2r" + traefik: image: traefik:v2.9 command: @@ -243,6 +246,19 @@ services: - /var/run/docker.sock:/var/run/docker.sock:ro networks: - r2r-network + + unstructured: + image: ${UNSTRUCTURED_IMAGE:-ragtoriches/unstructured-prod} + ports: + - "7275:7275" + networks: + - r2r-network + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:7275/health"] + interval: 10s + timeout: 5s + retries: 5 + volumes: hatchet_certs: hatchet_config: diff --git a/py/core/examples/data_unstructured/ods.ods b/py/core/examples/data_unstructured/ods.ods deleted file mode 100644 index a70063e26..000000000 Binary files a/py/core/examples/data_unstructured/ods.ods and /dev/null differ diff --git a/py/core/integrations/unstructured/main.py b/py/core/integrations/unstructured/main.py new file mode 100644 index 000000000..b70dd675b --- /dev/null +++ b/py/core/integrations/unstructured/main.py @@ -0,0 +1,50 @@ +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel +from typing import List, Dict +from io import BytesIO +import asyncio +import concurrent.futures +import os +import base64 +from unstructured.partition.auto import partition +import logging + +logger = logging.getLogger(__name__) + +app = FastAPI() + +class PartitionRequestModel(BaseModel): + file_content: bytes + chunking_config: Dict + +class PartitionResponseModel(BaseModel): + elements: List[Dict] + +executor = concurrent.futures.ThreadPoolExecutor(max_workers=int(os.environ.get("MAX_INGESTION_WORKERS", 10))) + +def run_partition(file_content: str, chunking_config: Dict) -> List[Dict]: + file_content_bytes = base64.b64decode(file_content) + file_io = BytesIO(file_content_bytes) + elements = partition(file=file_io, **chunking_config) + return [element.to_dict() for element in elements] + +@app.get("/health") +async def health_endpoint(): + return {"status": "ok"} + +@app.post("/partition", response_model=PartitionResponseModel) +async def partition_endpoint(request: PartitionRequestModel): + try: + logger.info(f"Partitioning request received") + loop = asyncio.get_event_loop() + elements = await loop.run_in_executor( + executor, + run_partition, + request.file_content, + request.chunking_config, + ) + logger.info(f"Partitioning completed") + return PartitionResponseModel(elements=elements) + except Exception as e: + logger.error(f"Error partitioning file: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) \ No newline at end of file diff --git a/py/core/main/hatchet/ingestion_workflow.py b/py/core/main/hatchet/ingestion_workflow.py index e3764304a..4e53208c9 100644 --- a/py/core/main/hatchet/ingestion_workflow.py +++ b/py/core/main/hatchet/ingestion_workflow.py @@ -20,7 +20,7 @@ class IngestFilesWorkflow: def __init__(self, ingestion_service: IngestionService): self.ingestion_service = ingestion_service - @r2r_hatchet.step() + @r2r_hatchet.step(timeout="60m") async def parse(self, context: Context) -> dict: input_data = context.workflow_input()["request"] parsed_data = IngestionServiceAdapter.parse_ingest_file_input( @@ -43,7 +43,7 @@ async def parse(self, context: Context) -> dict: "document_info": document_info.to_dict(), } - @r2r_hatchet.step(parents=["parse"]) + @r2r_hatchet.step(parents=["parse"], timeout="60m") async def extract(self, context: Context) -> dict: document_info_dict = context.step_output("parse")["document_info"] document_info = DocumentInfo(**document_info_dict) @@ -71,7 +71,7 @@ async def extract(self, context: Context) -> dict: "document_info": document_info.to_dict(), } - @r2r_hatchet.step(parents=["extract"]) + @r2r_hatchet.step(parents=["extract"], timeout="60m") async def chunk(self, context: Context) -> dict: document_info_dict = context.step_output("extract")["document_info"] document_info = DocumentInfo(**document_info_dict) @@ -103,7 +103,7 @@ async def chunk(self, context: Context) -> dict: "document_info": document_info.to_dict(), } - @r2r_hatchet.step(parents=["chunk"]) + @r2r_hatchet.step(parents=["chunk"], timeout="60m") async def embed(self, context: Context) -> dict: document_info_dict = context.step_output("chunk")["document_info"] document_info = DocumentInfo(**document_info_dict) @@ -139,7 +139,7 @@ async def embed(self, context: Context) -> dict: "document_info": document_info.to_dict(), } - @r2r_hatchet.step(parents=["embed"]) + @r2r_hatchet.step(parents=["embed"], timeout="60m") async def finalize(self, context: Context) -> dict: document_info_dict = context.step_output("embed")["document_info"] document_info = DocumentInfo(**document_info_dict) diff --git a/py/core/providers/parsing/unstructured_parsing.py b/py/core/providers/parsing/unstructured_parsing.py index a9382b4fb..c0f9d91e8 100644 --- a/py/core/providers/parsing/unstructured_parsing.py +++ b/py/core/providers/parsing/unstructured_parsing.py @@ -5,6 +5,9 @@ from copy import copy from io import BytesIO from typing import Any, AsyncGenerator +import httpx +import base64 +import json from pydantic import BaseModel from unstructured_client import UnstructuredClient @@ -88,16 +91,16 @@ def __init__(self, use_api: bool, config: ParsingConfig): self.operations = operations else: - try: - from unstructured.partition.auto import partition - - self.partition = partition - except ImportError as e: - raise ImportError( - "Please install the unstructured package to use the unstructured parsing provider." + try: + self.local_unstructured_url = os.environ["UNSTRUCTURED_LOCAL_URL"] + except KeyError as e: + raise ValueError( + "UNSTRUCTURED_LOCAL_URL environment variable is not set" ) from e + self.client = httpx.AsyncClient() + super().__init__(config) self.parsers = {} self._initialize_parsers() @@ -178,12 +181,24 @@ async def parse( else: logger.info( - f"Using local unstructured to parse document {document.id}" - ) - elements = self.partition( - file=file_content, - **self.config.chunking_config.extra_fields, + f"Using local unstructured fastapi server to parse document {document.id}" ) + # Base64 encode the file content + encoded_content = base64.b64encode(file_content.read()).decode('utf-8') + + logger.info(f"Sending a request to {self.local_unstructured_url}/partition") + + elements = await self.client.post( + f"{self.local_unstructured_url}/partition", + json={ + "file_content": encoded_content, # Use encoded string + "chunking_config": self.config.chunking_config.extra_fields, + }, + timeout=300, # Adjust timeout as needed + ) + + elements = elements.json() + elements = elements['elements'] iteration = 0 # if there are no chunks for iteration, element in enumerate(elements): diff --git a/py/poetry.lock b/py/poetry.lock index 833df8308..8840027ab 100644 --- a/py/poetry.lock +++ b/py/poetry.lock @@ -461,17 +461,17 @@ uvloop = ["uvloop (>=0.15.2)"] [[package]] name = "boto3" -version = "1.35.18" +version = "1.35.19" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" files = [ - {file = "boto3-1.35.18-py3-none-any.whl", hash = "sha256:71e237d3997cf93425947854d7b121c577944f391ba633afb0659e1015364704"}, - {file = "boto3-1.35.18.tar.gz", hash = "sha256:fd130308f1f49d748a5fc63de92de79a995b51c79af3947ddde8815fcf0684fe"}, + {file = "boto3-1.35.19-py3-none-any.whl", hash = "sha256:84b3fe1727945bc3cada832d969ddb3dc0d08fce1677064ca8bdc13a89c1a143"}, + {file = "boto3-1.35.19.tar.gz", hash = "sha256:9979fe674780a0b7100eae9156d74ee374cd1638a9f61c77277e3ce712f3e496"}, ] [package.dependencies] -botocore = ">=1.35.18,<1.36.0" +botocore = ">=1.35.19,<1.36.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.10.0,<0.11.0" @@ -480,13 +480,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.35.18" +version = "1.35.19" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.8" files = [ - {file = "botocore-1.35.18-py3-none-any.whl", hash = "sha256:1027083aeb1fe74057273410fd768e018e22f85adfbd717b5a69f578f7812b80"}, - {file = "botocore-1.35.18.tar.gz", hash = "sha256:e59da8b91ab06683d2725b6cbbb0383b30c68a241c3c63363f4c5bff59b3c0c0"}, + {file = "botocore-1.35.19-py3-none-any.whl", hash = "sha256:c83f7f0cacfe7c19b109b363ebfa8736e570d24922f16ed371681f58ebab44a9"}, + {file = "botocore-1.35.19.tar.gz", hash = "sha256:42d6d8db7250cbd7899f786f9861e02cab17dc238f64d6acb976098ed9809625"}, ] [package.dependencies] @@ -1829,13 +1829,13 @@ files = [ [[package]] name = "hatchet-sdk" -version = "0.36.21" +version = "0.36.22" description = "" optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "hatchet_sdk-0.36.21-py3-none-any.whl", hash = "sha256:cea35d5808cd960c0cb6e8fc7286ce8e11d253ba8c976dbd037ca14e978e1f6f"}, - {file = "hatchet_sdk-0.36.21.tar.gz", hash = "sha256:5c4532490afa8db71800a160e07d205a1c956b7678e3ef46e0bb93fa289463c6"}, + {file = "hatchet_sdk-0.36.22-py3-none-any.whl", hash = "sha256:ec69de6a6308ffbf730e1b6361b00224e3f2932b8070bd9c59987ea2d79cfa8d"}, + {file = "hatchet_sdk-0.36.22.tar.gz", hash = "sha256:e8ead3b30c21b8bf3f13ae799125eb8a84e612caf8f50a2eec395946939bf650"}, ] [package.dependencies] @@ -6009,6 +6009,11 @@ files = [ {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"}, {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"}, {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"}, + {file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"}, + {file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"}, + {file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"}, + {file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"}, + {file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"}, ] [package.dependencies]