-
Notifications
You must be signed in to change notification settings - Fork 252
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* up * up * up * up * up * workflow * up * Update py/core/integrations/unstructured/main.py Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> * up --------- Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
- Loading branch information
1 parent
f077df0
commit 30c7f7e
Showing
12 changed files
with
238 additions
and
87 deletions.
There are no files selected for viewing
2 changes: 1 addition & 1 deletion
2
.github/workflows/build-docker.yml → .github/workflows/build-r2r-docker.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
name: Build and Publish Docker Image | ||
name: Build and Publish R2R Docker Image | ||
|
||
on: | ||
workflow_dispatch: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# Manually triggered workflow that builds the Unstructured service image from
# py/Dockerfile.unstructured for amd64 and arm64 and pushes it to the registry
# under both a version tag and `latest`.
name: Build and Publish Unstructured Docker Image

on:
  workflow_dispatch:

env:
  REGISTRY_BASE: ragtoriches

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      # toml is needed by the version lookup in the next step.
      - name: Install toml package
        run: pip install toml

      - name: Determine version
        id: version
        run: |
          # RELEASE_VERSION is consumed by the tag list and the manifest check
          # below; without it the versioned tag would end in a bare colon.
          # NOTE(review): assumes the release version lives at
          # [tool.poetry].version in py/pyproject.toml - confirm.
          VERSION=$(python -c "import toml; print(toml.load('py/pyproject.toml')['tool']['poetry']['version'])")
          if [ -z "$VERSION" ]; then
            echo "Could not determine version from py/pyproject.toml" >&2
            exit 1
          fi
          echo "RELEASE_VERSION=$VERSION" >> "$GITHUB_OUTPUT"
          echo "REGISTRY_IMAGE=${{ env.REGISTRY_BASE }}/unst-prod" >> "$GITHUB_OUTPUT"

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Docker Auth
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.RAGTORICHES_DOCKER_UNAME }}
          password: ${{ secrets.RAGTORICHES_DOCKER_TOKEN }}

      - name: Build and push image
        uses: docker/build-push-action@v5
        with:
          context: ./py
          file: ./py/Dockerfile.unstructured
          platforms: linux/amd64,linux/arm64
          push: true
          tags: |
            ${{ steps.version.outputs.REGISTRY_IMAGE }}:${{ steps.version.outputs.RELEASE_VERSION }}
            ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest
          provenance: false
          sbom: false

      - name: Verify manifest
        run: |
          docker buildx imagetools inspect ${{ steps.version.outputs.REGISTRY_IMAGE }}:${{ steps.version.outputs.RELEASE_VERSION }}
          docker buildx imagetools inspect ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,74 +1,59 @@ | ||
# ======================= | ||
# Builder Stage | ||
# ======================= | ||
# Stage 1: Builder | ||
FROM python:3.10-slim AS builder | ||
|
||
# Install system dependencies (including those needed for Unstructured and OpenCV) | ||
# Install system dependencies in a single RUN command to reduce layers | ||
RUN apt-get update && apt-get install -y --no-install-recommends \ | ||
gcc g++ musl-dev curl libffi-dev gfortran libopenblas-dev \ | ||
tesseract-ocr libtesseract-dev libleptonica-dev pkg-config \ | ||
poppler-utils libmagic1 \ | ||
libgl1-mesa-glx libglib2.0-0 \ | ||
&& apt-get clean && rm -rf /var/lib/apt/lists/* | ||
|
||
# Set working directory | ||
WORKDIR /app | ||
|
||
# Install Poetry | ||
RUN pip install --no-cache-dir poetry | ||
|
||
# Create application directory | ||
RUN mkdir -p /app/py | ||
WORKDIR /app/py | ||
# Copy only dependency files first to leverage caching | ||
COPY pyproject.toml poetry.lock /app/ | ||
|
||
# Copy Poetry configuration | ||
COPY pyproject.toml /app/py/pyproject.toml | ||
# Set working directory to /app/py for dependency installation | ||
WORKDIR /app | ||
|
||
# Install dependencies, including gunicorn, uvicorn, and unstructured | ||
# Install Python dependencies without creating a virtual environment | ||
RUN poetry config virtualenvs.create false \ | ||
&& poetry install --extras "core" --no-dev --no-root \ | ||
&& pip install --no-cache-dir gunicorn uvicorn | ||
|
||
# ======================= | ||
# Final Stage | ||
# ======================= | ||
# Copy only the necessary application files | ||
COPY core /app/core | ||
COPY r2r /app/r2r | ||
COPY r2r.toml /app/r2r.toml | ||
|
||
# Stage 2: Final Image | ||
FROM python:3.10-slim | ||
|
||
# Install runtime dependencies including Pandoc | ||
RUN apt-get update \ | ||
&& apt-get install -y --no-install-recommends \ | ||
curl \ | ||
tesseract-ocr \ | ||
poppler-utils \ | ||
libmagic1 \ | ||
libgl1-mesa-glx \ | ||
libglib2.0-0 \ | ||
pandoc \ | ||
libreoffice \ | ||
# Install runtime dependencies | ||
RUN apt-get update && apt-get install -y --no-install-recommends curl \ | ||
&& apt-get clean && rm -rf /var/lib/apt/lists/* | ||
|
||
# Set working directory | ||
WORKDIR /app | ||
|
||
# Copy Python packages and binaries from builder | ||
# Copy installed Python packages and binaries from the builder stage | ||
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages | ||
COPY --from=builder /usr/local/bin /usr/local/bin | ||
|
||
# Initialize Unstructured models | ||
RUN python -c "from unstructured.partition.model_init import initialize; initialize()" | ||
# Copy only the necessary application files from the builder | ||
COPY --from=builder /app/core /app/core | ||
COPY --from=builder /app/r2r /app/r2r | ||
COPY --from=builder /app/r2r.toml /app/r2r.toml | ||
|
||
# Expose the port and set environment variables | ||
ARG PORT=8000 | ||
# Set environment variables for port and host | ||
ARG PORT=7272 | ||
ARG HOST=0.0.0.0 | ||
ENV PORT=$PORT \ | ||
HOST=$HOST \ | ||
TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata | ||
|
||
EXPOSE $PORT | ||
ENV PORT=${PORT} HOST=${HOST} | ||
|
||
# Copy application code | ||
COPY . /app | ||
COPY core /app/core | ||
COPY r2r /app/r2r | ||
COPY r2r.toml /app/r2r.toml | ||
COPY pyproject.toml /app/pyproject.toml | ||
# Expose the specified port | ||
EXPOSE ${PORT} | ||
|
||
# Set the default command to run the application | ||
CMD ["sh", "-c", "uvicorn core.main.app_entry:app --host $HOST --port $PORT"] | ||
# Use the exec form of CMD for better signal handling | ||
CMD ["uvicorn", "core.main.app_entry:app", "--host", "0.0.0.0", "--port", "7272"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Image for the standalone Unstructured partitioning service: installs the
# system and Python dependencies, then serves main.py with uvicorn on 7275.
FROM python:3.10-slim AS builder

# Install system dependencies (including those needed for Unstructured and OpenCV)
# NOTE(review): libgl1-mesa-glx is a transitional package on some Debian
# releases - confirm it is still installable on the base image in use.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc g++ musl-dev curl libffi-dev gfortran libopenblas-dev \
    tesseract-ocr libtesseract-dev libleptonica-dev pkg-config \
    poppler-utils libmagic1 pandoc libreoffice \
    libgl1-mesa-glx libglib2.0-0 \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# NOTE(review): path assumes Tesseract 5's data directory layout - confirm
# against the tesseract-ocr package actually installed above.
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

WORKDIR /app

RUN pip install --no-cache-dir unstructured "unstructured[all-docs]"

# Run Unstructured's model initializer during the build.
# NOTE(review): presumably this fetches/caches models so containers do not do
# it at startup - confirm.
RUN python -c "from unstructured.partition.model_init import initialize; initialize()"

# --no-cache-dir matches the install above and keeps pip's download cache out
# of the image layer.
RUN pip install --no-cache-dir gunicorn uvicorn fastapi httpx

COPY core/integrations/unstructured/main.py .

EXPOSE 7275

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7275"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
from fastapi import FastAPI, HTTPException | ||
from pydantic import BaseModel | ||
from typing import List, Dict | ||
from io import BytesIO | ||
import asyncio | ||
import concurrent.futures | ||
import os | ||
import base64 | ||
from unstructured.partition.auto import partition | ||
import logging | ||
|
||
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)

# FastAPI application object that the route decorators in this module
# register their handlers on.
app = FastAPI()
|
||
class PartitionRequestModel(BaseModel):
    # Request body accepted by POST /partition.

    # Document payload; run_partition() base64-decodes this value before
    # handing it to the partitioner.
    file_content: bytes
    # Keyword arguments expanded into the partition() call.
    chunking_config: Dict
|
||
class PartitionResponseModel(BaseModel):
    # Response body returned by POST /partition.

    # One dictionary per extracted element, as produced by element.to_dict().
    elements: List[Dict]
|
||
# Shared worker pool for the blocking partition calls. Its size is read from
# the MAX_INGESTION_WORKERS environment variable and falls back to 10.
executor = concurrent.futures.ThreadPoolExecutor(
    max_workers=int(os.getenv("MAX_INGESTION_WORKERS", 10)),
)
|
||
def run_partition(file_content: bytes, chunking_config: Dict) -> List[Dict]:
    """Decode a base64 payload and partition it with ``unstructured``.

    Args:
        file_content: Base64-encoded document contents. Annotated as ``bytes``
            to match ``PartitionRequestModel.file_content``, which is what the
            endpoint passes in; ``base64.b64decode`` also accepts an ASCII
            ``str``.
        chunking_config: Keyword arguments forwarded verbatim to ``partition``.

    Returns:
        One dictionary per extracted element, from ``element.to_dict()``.

    Raises:
        binascii.Error: If the payload is incorrectly padded base64.
    """
    raw_bytes = base64.b64decode(file_content)
    # partition() reads from a file-like object, so wrap the decoded bytes.
    elements = partition(file=BytesIO(raw_bytes), **chunking_config)
    return [element.to_dict() for element in elements]
|
||
# GET /health: liveness probe that always answers with an "ok" status.
@app.get("/health")
async def health_endpoint():
    status_payload = {"status": "ok"}
    return status_payload
|
||
# POST /partition: partition the submitted payload and return its elements.
#
# The blocking run_partition() call is dispatched to the module-level thread
# pool and awaited, so the coroutine does not block the event loop while the
# document is parsed. Any failure is logged with its traceback and reported to
# the client as HTTP 500 with the exception text as the detail.
@app.post("/partition", response_model=PartitionResponseModel)
async def partition_endpoint(request: PartitionRequestModel):
    try:
        logger.info("Partitioning request received")
        # get_running_loop() is the documented way to obtain the loop from
        # inside a coroutine.
        loop = asyncio.get_running_loop()
        elements = await loop.run_in_executor(
            executor,
            run_partition,
            request.file_content,
            request.chunking_config,
        )
        logger.info("Partitioning completed")
        return PartitionResponseModel(elements=elements)
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception("Error partitioning file: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
Oops, something went wrong.