Skip to content

Commit

Permalink
Enhance unstructured (#1168)
Browse files Browse the repository at this point in the history
* up

* up

* up

* up

* up

* workflow

* up

* Update py/core/integrations/unstructured/main.py

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>

* up

---------

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
  • Loading branch information
shreyaspimpalgaonkar and ellipsis-dev[bot] authored Sep 13, 2024
1 parent f077df0 commit 30c7f7e
Show file tree
Hide file tree
Showing 12 changed files with 238 additions and 87 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Build and Publish Docker Image
name: Build and Publish R2R Docker Image

on:
workflow_dispatch:
Expand Down
54 changes: 54 additions & 0 deletions .github/workflows/build-unst-docker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Manually triggered workflow that builds the Unstructured service image from
# py/Dockerfile.unstructured for amd64 + arm64 and pushes it to Docker Hub,
# tagged with both the project version and `latest`.
name: Build and Publish Unstructured Docker Image

on:
  workflow_dispatch:

env:
  REGISTRY_BASE: ragtoriches

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      # `toml` is needed by the version lookup in the next step.
      - name: Install toml package
        run: pip install toml

      # Publishes two step outputs consumed below:
      #   RELEASE_VERSION - version tag for the image
      #   REGISTRY_IMAGE  - fully qualified image repository name
      # NOTE(review): assumes the version lives at [tool.poetry].version in
      # py/pyproject.toml (the Docker build context is ./py and the images are
      # installed with Poetry) - confirm.
      - name: Determine version
        id: version
        run: |
          VERSION=$(python -c "import toml; print(toml.load('py/pyproject.toml')['tool']['poetry']['version'])")
          if [ -z "$VERSION" ]; then
            echo "Could not determine version from py/pyproject.toml" >&2
            exit 1
          fi
          echo "RELEASE_VERSION=${VERSION#v}" >> "$GITHUB_OUTPUT"
          echo "REGISTRY_IMAGE=${{ env.REGISTRY_BASE }}/unst-prod" >> "$GITHUB_OUTPUT"

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Docker Auth
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.RAGTORICHES_DOCKER_UNAME }}
          password: ${{ secrets.RAGTORICHES_DOCKER_TOKEN }}

      - name: Build and push image
        uses: docker/build-push-action@v5
        with:
          context: ./py
          file: ./py/Dockerfile.unstructured
          platforms: linux/amd64,linux/arm64
          push: true
          tags: |
            ${{ steps.version.outputs.REGISTRY_IMAGE }}:${{ steps.version.outputs.RELEASE_VERSION }}
            ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest
          provenance: false
          sbom: false

      # Confirms both tags resolve to a pushed manifest.
      - name: Verify manifest
        run: |
          docker buildx imagetools inspect ${{ steps.version.outputs.REGISTRY_IMAGE }}:${{ steps.version.outputs.RELEASE_VERSION }}
          docker buildx imagetools inspect ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest
19 changes: 7 additions & 12 deletions py/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
FROM python:3.10-slim AS builder

# Install system dependencies (including those needed for Unstructured and OpenCV)
# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc g++ musl-dev curl libffi-dev gfortran libopenblas-dev \
tesseract-ocr libtesseract-dev libleptonica-dev pkg-config \
poppler-utils libmagic1 \
libgl1-mesa-glx libglib2.0-0 \
&& apt-get clean && rm -rf /var/lib/apt/lists/*

WORKDIR /app
Expand All @@ -14,21 +11,21 @@ RUN pip install --no-cache-dir poetry

# Copy the entire project into the container
COPY . /app

# Ensure that the working directory is set to /app/py
WORKDIR /app/py

# Install dependencies
RUN poetry config virtualenvs.create false \
&& poetry install --extras "core" --no-dev --no-root \
&& pip install --no-cache-dir gunicorn uvicorn
&& pip install --no-cache-dir gunicorn uvicorn

# Create the final image
# Final stage to keep the image small
FROM python:3.10-slim

# Install runtime dependencies
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
curl tesseract-ocr poppler-utils libmagic1 pandoc libreoffice \
libgl1-mesa-glx libglib2.0-0 \
&& apt-get install -y --no-install-recommends curl \
&& apt-get clean && rm -rf /var/lib/apt/lists/*

WORKDIR /app
Expand All @@ -37,18 +34,16 @@ WORKDIR /app
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# Copy the application and config
# Copy the necessary application files
COPY core /app/core
COPY r2r /app/r2r
COPY r2r.toml /app/r2r.toml
COPY pyproject.toml /app/pyproject.toml

# Expose the port
ARG PORT=8000
ARG HOST=0.0.0.0
ENV PORT=$PORT HOST=$HOST
EXPOSE $PORT
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata

# Run the application
CMD ["sh", "-c", "uvicorn core.main.app_entry:app --host $HOST --port $PORT"]
75 changes: 30 additions & 45 deletions py/Dockerfile.dev
Original file line number Diff line number Diff line change
@@ -1,74 +1,59 @@
# =======================
# Builder Stage
# =======================
# Stage 1: Builder
FROM python:3.10-slim AS builder

# Install system dependencies (including those needed for Unstructured and OpenCV)
# Install system dependencies in a single RUN command to reduce layers
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc g++ musl-dev curl libffi-dev gfortran libopenblas-dev \
tesseract-ocr libtesseract-dev libleptonica-dev pkg-config \
poppler-utils libmagic1 \
libgl1-mesa-glx libglib2.0-0 \
&& apt-get clean && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Install Poetry
RUN pip install --no-cache-dir poetry

# Create application directory
RUN mkdir -p /app/py
WORKDIR /app/py
# Copy only dependency files first to leverage caching
COPY pyproject.toml poetry.lock /app/

# Copy Poetry configuration
COPY pyproject.toml /app/py/pyproject.toml
# Set working directory to /app/py for dependency installation
WORKDIR /app

# Install dependencies, including gunicorn, uvicorn, and unstructured
# Install Python dependencies without creating a virtual environment
RUN poetry config virtualenvs.create false \
&& poetry install --extras "core" --no-dev --no-root \
&& pip install --no-cache-dir gunicorn uvicorn

# =======================
# Final Stage
# =======================
# Copy only the necessary application files
COPY core /app/core
COPY r2r /app/r2r
COPY r2r.toml /app/r2r.toml

# Stage 2: Final Image
FROM python:3.10-slim

# Install runtime dependencies including Pandoc
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
curl \
tesseract-ocr \
poppler-utils \
libmagic1 \
libgl1-mesa-glx \
libglib2.0-0 \
pandoc \
libreoffice \
# Install runtime dependencies
RUN apt-get update && apt-get install -y --no-install-recommends curl \
&& apt-get clean && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy Python packages and binaries from builder
# Copy installed Python packages and binaries from the builder stage
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# Initialize Unstructured models
RUN python -c "from unstructured.partition.model_init import initialize; initialize()"
# Copy only the necessary application files from the builder
COPY --from=builder /app/core /app/core
COPY --from=builder /app/r2r /app/r2r
COPY --from=builder /app/r2r.toml /app/r2r.toml

# Expose the port and set environment variables
ARG PORT=8000
# Set environment variables for port and host
ARG PORT=7272
ARG HOST=0.0.0.0
ENV PORT=$PORT \
HOST=$HOST \
TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata

EXPOSE $PORT
ENV PORT=${PORT} HOST=${HOST}

# Copy application code
COPY . /app
COPY core /app/core
COPY r2r /app/r2r
COPY r2r.toml /app/r2r.toml
COPY pyproject.toml /app/pyproject.toml
# Expose the specified port
EXPOSE ${PORT}

# Set the default command to run the application
CMD ["sh", "-c", "uvicorn core.main.app_entry:app --host $HOST --port $PORT"]
# Start uvicorn through a shell so the HOST/PORT environment variables defined
# by this image (defaults 0.0.0.0 / 7272) are honoured at runtime; `exec`
# replaces the shell so uvicorn still receives signals directly.
CMD ["sh", "-c", "exec uvicorn core.main.app_entry:app --host \"$HOST\" --port \"$PORT\""]
29 changes: 29 additions & 0 deletions py/Dockerfile.unstructured
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Image for the Unstructured partitioning service: a FastAPI app
# (core/integrations/unstructured/main.py) served by uvicorn on port 7275.
FROM python:3.10-slim AS builder

# Install system dependencies (including those needed for Unstructured and OpenCV).
# NOTE(review): libgl1 replaces the transitional libgl1-mesa-glx package, which
# is not available on newer Debian releases; libgl1 supplies libGL.so.1
# directly - confirm against the base image in use.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc g++ musl-dev curl libffi-dev gfortran libopenblas-dev \
    tesseract-ocr libtesseract-dev libleptonica-dev pkg-config \
    poppler-utils libmagic1 pandoc libreoffice \
    libgl1 libglib2.0-0 \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

WORKDIR /app

# The [all-docs] extra already pulls in the base `unstructured` package.
RUN pip install --no-cache-dir "unstructured[all-docs]"

# Run Unstructured's model initialisation at build time so it is part of the image.
RUN python -c "from unstructured.partition.model_init import initialize; initialize()"

# Web-server stack for main.py; --no-cache-dir keeps pip's cache out of the layer.
RUN pip install --no-cache-dir gunicorn uvicorn fastapi httpx

COPY core/integrations/unstructured/main.py .

EXPOSE 7275

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7275"]
2 changes: 2 additions & 0 deletions py/cli/commands/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,8 @@ def image_exists(img):
check=True,
)

subprocess.run(["docker", "build", "-t", "unstructured-docker", "-f", "Dockerfile.unstructured", "."], check=True)

if config_path:
config_path = os.path.abspath(config_path)

Expand Down
20 changes: 18 additions & 2 deletions py/compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@ networks:
labels:
- "com.docker.compose.recreate=always"



services:

setup-token:
Expand Down Expand Up @@ -169,6 +167,8 @@ services:
# Unstructured
- UNSTRUCTURED_API_KEY=${UNSTRUCTURED_API_KEY:-}
- UNSTRUCTURED_API_URL=${UNSTRUCTURED_API_URL:-https://api.unstructured.io/general/v0/general}
- UNSTRUCTURED_LOCAL_URL=${UNSTRUCTURED_LOCAL_URL:-http://unstructured:7275}
- UNSTRUCTURED_NUM_WORKERS=${UNSTRUCTURED_NUM_WORKERS:-10}

# Hatchet
- HATCHET_CLIENT_TLS_STRATEGY=none
Expand Down Expand Up @@ -213,6 +213,8 @@ services:
condition: service_healthy
neo4j:
condition: service_healthy
unstructured:
condition: service_healthy

r2r-dashboard:
image: emrgntcmplxty/r2r-dashboard:latest
Expand All @@ -225,6 +227,7 @@ services:
- "traefik.http.routers.r2r-dashboard.rule=PathPrefix(`/`)"
- "traefik.http.services.r2r-dashboard.loadbalancer.server.port=3000"
- "traefik.http.routers.r2r-dashboard.entrypoints=r2r"

traefik:
image: traefik:v2.9
command:
Expand All @@ -243,6 +246,19 @@ services:
- /var/run/docker.sock:/var/run/docker.sock:ro
networks:
- r2r-network

# Unstructured partitioning service, reachable on the compose network as
# http://unstructured:7275.
unstructured:
  # The default matches the repository name pushed by
  # .github/workflows/build-unst-docker.yml (ragtoriches/unst-prod);
  # set UNSTRUCTURED_IMAGE to use a different image.
  image: ${UNSTRUCTURED_IMAGE:-ragtoriches/unst-prod}
  ports:
    - "7275:7275"
  networks:
    - r2r-network
  # Services that depend on this one with `condition: service_healthy`
  # wait for this probe of the /health endpoint to pass.
  healthcheck:
    test: ["CMD", "curl", "-f", "http://localhost:7275/health"]
    interval: 10s
    timeout: 5s
    retries: 5

volumes:
hatchet_certs:
hatchet_config:
Expand Down
Binary file removed py/core/examples/data_unstructured/ods.ods
Binary file not shown.
50 changes: 50 additions & 0 deletions py/core/integrations/unstructured/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Dict
from io import BytesIO
import asyncio
import concurrent.futures
import os
import base64
from unstructured.partition.auto import partition
import logging

logger = logging.getLogger(__name__)

app = FastAPI()

class PartitionRequestModel(BaseModel):
    """Request body accepted by the ``/partition`` endpoint."""

    # Document payload; ``run_partition`` base64-decodes it before parsing.
    file_content: bytes
    # Keyword arguments forwarded as-is to ``partition``.
    chunking_config: Dict

class PartitionResponseModel(BaseModel):
    """Response body returned by the ``/partition`` endpoint."""

    # One dict per partitioned element, as produced by ``element.to_dict()``.
    elements: List[Dict]

# Shared thread pool for the blocking partition calls; the pool size comes
# from the MAX_INGESTION_WORKERS environment variable (default 10).
executor = concurrent.futures.ThreadPoolExecutor(
    max_workers=int(os.environ.get("MAX_INGESTION_WORKERS", 10)),
)

def run_partition(file_content: bytes, chunking_config: Dict) -> List[Dict]:
    """Decode a base64 payload and partition it into element dicts.

    Args:
        file_content: Base64-encoded document. The endpoint passes the
            ``bytes`` value of ``PartitionRequestModel.file_content``;
            ``base64.b64decode`` also accepts an ASCII ``str``.
        chunking_config: Keyword arguments forwarded to ``partition``.

    Returns:
        The ``to_dict()`` form of every element returned by ``partition``.
    """
    raw_bytes = base64.b64decode(file_content)
    # ``partition`` is given an in-memory file object, not a path.
    elements = partition(file=BytesIO(raw_bytes), **chunking_config)
    return [element.to_dict() for element in elements]

@app.get("/health")
async def health_endpoint():
    """Health probe: always answers with a fixed ``status: ok`` payload."""
    payload = {"status": "ok"}
    return payload

@app.post("/partition", response_model=PartitionResponseModel)
async def partition_endpoint(request: PartitionRequestModel):
    """Partition an uploaded document without blocking the event loop.

    The blocking ``run_partition`` call is handed to the module-level thread
    pool and awaited.

    Args:
        request: Base64 file content plus the chunking configuration.

    Returns:
        A ``PartitionResponseModel`` wrapping the partitioned elements.

    Raises:
        HTTPException: Status 500 with the error text as ``detail`` when
            anything in the partitioning path raises.
    """
    try:
        logger.info("Partitioning request received")
        # Inside a coroutine the running loop is the one to schedule on.
        loop = asyncio.get_running_loop()
        elements = await loop.run_in_executor(
            executor,
            run_partition,
            request.file_content,
            request.chunking_config,
        )
        logger.info("Partitioning completed")
        return PartitionResponseModel(elements=elements)
    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback.
        logger.exception("Error partitioning file: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
Loading

0 comments on commit 30c7f7e

Please sign in to comment.