Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create example.env #236

Merged
merged 8 commits into from
Feb 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions dj_backend_server/CHANGELOG.MD
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
2.14.2024
- Added example.env to streamline environment setup.
- Implemented translation fixes to enhance application localization.
- Updated docker-compose.yaml to prefix each container with oc_ for better namespace management.
- Performed fixes in requirements.txt for improved dependency resolution.
- Ensured that the Vector Database (Qdrant) exists before web crawling operations start, addressing issues encountered with large websites by guaranteeing the Vector Database is created and available.
6 changes: 4 additions & 2 deletions dj_backend_server/api/data_sources/pdf_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,15 +222,17 @@ def txt_to_vectordb(shared_folder: str, namespace: str, delete_folder_flag: bool

docs = text_splitter.split_documents(raw_docs)

print("external files docs -->", docs);
# print("external files docs -->", docs);

if not docs:
print("No documents were processed successfully.")
return

embeddings = get_embeddings()

print(f"Initializing vector store for namespace: {namespace} with {len(docs)} documents.")
init_vector_store(docs, embeddings, StoreOptions(namespace=namespace))
print(f"Vector store initialized successfully for namespace: {namespace}.")

print(f'Folder need or not to delete. {delete_folder_flag}')
# Delete folder if flag is set
Expand All @@ -243,7 +245,7 @@ def txt_to_vectordb(shared_folder: str, namespace: str, delete_folder_flag: bool
# pdf_data_source.save()
failed_job = FailedJob(uuid=str(uuid4()), connection='default', queue='default', payload='txt_to_vectordb', exception=str(e),failed_at=timezone.now())
failed_job.save()
print(e)
print(f"Failed to initialize vector store for namespace: {namespace}. Exception: {e}")
traceback.print_exc()


Expand Down
28 changes: 28 additions & 0 deletions dj_backend_server/api/utils/init_vector_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,31 @@ def delete_from_vector_store(namespace: str, filter_criteria: dict) -> None:
else:
raise NotImplementedError(f"Delete operation is not implemented for the store type: {store_type}")


def ensure_vector_database_exists(namespace):
    """Make sure a vector-store collection named ``namespace`` exists.

    Only the Qdrant store type is handled; for any other ``STORE`` value this
    function does nothing. For Qdrant, the collection is looked up by name and
    created if missing, with up to three create-and-verify attempts.

    This is best-effort: any exception raised while talking to the store
    (including the final "failed after 3 attempts" error) is caught and
    printed rather than propagated. A missing or invalid ``STORE`` environment
    variable is NOT caught, because it is read before the ``try`` block.

    Args:
        namespace: Name of the collection to look up or create.

    Returns:
        None.
    """
    store_type = StoreType[os.environ['STORE']]
    try:
        if store_type == StoreType.QDRANT:
            client = QdrantClient(url=os.environ['QDRANT_URL'])

            def existing_collection_names():
                # get_collections() yields collection description objects, not
                # plain strings, so membership must be tested on their names.
                return {
                    collection.name
                    for collection in client.get_collections().collections
                }

            for attempt in range(3):
                if namespace in existing_collection_names():
                    print(f"Namespace '{namespace}' exists.")
                    return

                print(f"Namespace '{namespace}' does not exist. Attempting to create.")
                vectors_config = models.VectorParams(
                    size=1536,  # Using 1536-dimensional vectors, adjust as necessary
                    distance=models.Distance.COSINE,  # Using cosine distance, adjust as necessary
                )
                client.create_collection(collection_name=namespace, vectors_config=vectors_config)

                # Recheck if the namespace was successfully created
                if namespace in existing_collection_names():
                    print(f"Namespace '{namespace}' successfully created.")
                    return
                print(f"Failed to create namespace '{namespace}' on attempt {attempt + 1}.")

            raise Exception(f"Failed to ensure or create namespace '{namespace}' after 3 attempts.")
    except Exception as e:
        print(f"Failed to ensure vector database exists for namespace {namespace}: {e}")

10 changes: 6 additions & 4 deletions dj_backend_server/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ version: '3.9'
services:
mysql:
restart: unless-stopped
platform: linux/arm64/v8
container_name: oc_mysql
image: "mysql:8"
ports:
- "3307:3306"
Expand All @@ -20,6 +20,7 @@ services:

qdrant:
image: qdrant/qdrant
container_name: oc_qdrant
ports:
- 6333:6333
- 6334:6334
Expand All @@ -32,7 +33,7 @@ services:
build:
context: .
dockerfile: Dockerfile
container_name: web
container_name: oc_web
ports:
- "8001:8000"
volumes:
Expand All @@ -53,6 +54,7 @@ services:

adminer:
image: adminer
container_name: oc_adminer
ports:
- "8080:8080"
environment:
Expand All @@ -66,7 +68,7 @@ services:
build:
context: .
dockerfile: Dockerfile
container_name: celery
container_name: oc_celery
volumes:
- ./website_data_sources:/app/website_data_sources
# - ./llama-2-7b-chat.ggmlv3.q4_K_M.bin:/app/llama-2-7b-chat.ggmlv3.q4_K_M.bin:ro
Expand All @@ -80,7 +82,7 @@ services:

redis:
image: redis:latest
container_name: redis_cache
container_name: oc_redis_cache
ports:
- "6379:6379"
volumes:
Expand Down
75 changes: 75 additions & 0 deletions dj_backend_server/example.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
##########################################################

# Edit values for your site.
# your app secret key
SECRET_KEY='ADD-YOUR-CUSTOM-KEY-HERE'
# For openai
OPENAI_API_KEY=YOURKEY
# Add the hosts/IPs you want to allow (comma-separated)
ALLOWED_HOSTS=localhost,0.0.0.0
# Use * only in dev environment
#ALLOWED_HOSTS=*
# Your SITE URL
APP_URL='https://YOUR-URL-HERE'

##########################################################

# "azure" | "openai" | llama2
OPENAI_API_TYPE=openai
OPENAI_API_MODEL=gpt-4-1106-preview
OPENAI_API_TEMPERATURE=1

# If using azure
# AZURE_OPENAI_API_BASE=
# AZURE_OPENAI_API_KEY=
# AZURE_OPENAI_API_VERSION=2023-03-15-preview
# AZURE_OPENAI_EMBEDDING_MODEL_NAME=
# AZURE_OPENAI_DEPLOYMENT_NAME=
# AZURE_OPENAI_COMPLETION_MODEL=gpt-35-turbo

# "azure" | "openai" | llama2
EMBEDDING_PROVIDER=openai

# Vector Store, PINECONE|QDRANT
STORE=QDRANT


# if using pinecone
# PINECONE_API_KEY=
# PINECONE_ENV=
# VECTOR_STORE_INDEX_NAME=


# if using qdrant
QDRANT_URL=http://qdrant:6333


# optional, defaults to 15
MAX_PAGES_CRAWL=150

# --- these will change if you decide to start testing the software
CELERY_BROKER_URL=redis://redis:6379/
CELERY_RESULT_BACKEND=redis://redis:6379/
DATABASE_NAME=openchat
DATABASE_USER=dbuser
DATABASE_PASSWORD=dbpass
DATABASE_HOST=mysql
DATABASE_PORT=3306

# use 'external' if you want to use below services.
PDF_LIBRARY = 'external'

#PDF API - OCRWebService.com (REST API). https://www.ocrwebservice.com/api/restguide
#Extract text from scanned images and PDF documents and convert into editable formats.
#Please create new account with ocrwebservice.com via http://www.ocrwebservice.com/account/signup and get license code
OCR_LICCODE = 'LICENSE-CODE'
OCR_USERNAME = 'USERNAME'
OCR_LANGUAGE = 'english'
# Enable this to clean up the OCR text, which can be messy and full of garbage; note that it will incur LLM costs if the LLM is a paid one. Use carefully.
# Use 1 to enable, 0 to disable.
OCR_LLM = '1'

# retrieval_qa | conversation_retrieval, retrieval_qa works better with azure openai
# if you want to use the conversation_retrieval | retrieval_qa chain
CHAIN_TYPE=conversation_retrieval

7 changes: 3 additions & 4 deletions dj_backend_server/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ click-repl==0.3.0
cryptography==41.0.3
dataclasses-json==0.5.14
Django==4.2.3
django-rest-swagger
djangorestframework
django-rest-swagger==2.2.0
dnspython==2.4.1
drf-spectacular==0.27.1
drf_spectacular.extensions==0.0.2
Expand All @@ -31,8 +30,8 @@ grpcio-tools==1.56.2
h11==0.14.0
h2==4.1.0
hpack==4.0.0
httpcore1.0.2
httpx=0.25.2
httpcore==1.0.2
httpx==0.25.2
hyperframe==6.0.1
idna==3.6
kombu==5.3.1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,20 +79,16 @@ <h1 class="text-3xl text-slate-800 font-bold mb-6">{% trans 'Website information

<div class="flex items-center justify-between space-x-6 mb-8">
<div>
<div class="font-medium text-slate-800 text-sm mb-1">{% trans "Just to make sure we are on
the same page" %}</div>
<div class="font-medium text-slate-800 text-sm mb-1">{% trans "Just to make sure we are on the same page" %}</div>
<div class="text-xs">
{% trans "Sometimes, we might face challenges when trying to crawl certain websites,
especially the ones built using JavaScript (Single-Page Applications). However, we're
currently working on adding headless browsing to our system so that we can support all
kinds of websites." %}
{% trans "Sometimes, we might face challenges when trying to crawl certain websites, especially the ones built using JavaScript (Single-Page Applications). However, we're currently working on adding headless browsing to our system so that we can support all kinds of websites." %}
</div>
</div>
</div>
</div>
<div class="flex items-center justify-between">
<a class="text-sm underline hover:no-underline" href="{% url 'onboarding.data-source' %}">&lt;- {% trans "Back" %}</a>
<button type="submit" class="btn bg-indigo-500 hover:bg-indigo-600 text-white ml-auto">{% trans "Next Step" %}</button>
<button type="submit" class="btn bg-primary text-white py-2 px-3">{% trans "Next Step" %}</button>
</div>
</form>
</div>
Expand Down
3 changes: 1 addition & 2 deletions dj_backend_server/web/templates/onboarding/step-0.html
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,7 @@ <h1 class="text-3xl text-slate-800 font-bold mb-6">{% trans "Let's set up your f
</div>
<h3 class="text-lg font-bold text-slate-800 pl-9">{% trans "You provide the system with data" %}</h3>
</div>
<div class="pl-9">{% trans "It could be a website, pdf files, and soon you will have the option to
integrate with many more" %}</div>
<div class="pl-9">{% trans "It could be a website, pdf files, and soon you will have the option to integrate with many more" %}</div>
</li>
<!-- List item -->
<li class="relative py-2">
Expand Down
2 changes: 1 addition & 1 deletion dj_backend_server/web/templates/onboarding/step-2.html
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ <h1 class="text-3xl text-slate-800 font-bold mb-6">{% trans "Website information

<div class="flex items-center justify-between space-x-6 mb-8">
<div>
<div class="font-medium text-slate-800 text-sm mb-1">{% trans "Just to make sure we are on the same page" %} 🫶
<div class="font-medium text-slate-800 text-sm mb-1">{% trans "Just to make sure we are on the same page" %}
</div>
<div class="text-xs">
{% trans "We might not be able to crawl some websites, especially websites that are built using JS (SPA), we are working on adding headless browsing to support all sorts of websites." %}
Expand Down
7 changes: 6 additions & 1 deletion dj_backend_server/web/workers/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from web.signals.website_data_source_crawling_was_completed import website_data_source_crawling_completed
from web.models.crawled_pages import CrawledPages
from web.models.website_data_sources import WebsiteDataSource
from api.utils.init_vector_store import ensure_vector_database_exists
from django.core.files.storage import default_storage
from django.core.files.base import ContentFile
from django.utils.text import slugify
Expand Down Expand Up @@ -35,6 +36,10 @@ def start_recursive_crawler(data_source_id, chatbot_id):
Exception: If any error occurs during the crawling process, the function will catch the exception, set the
crawling status to "failed", and re-raise the exception.
"""
# Ensure vector database exists before starting the crawl

ensure_vector_database_exists(str(chatbot_id))
# print("Starting recursive crawler")
data_source = WebsiteDataSource.objects.get(pk=data_source_id)
root_url = data_source.root_url

Expand Down Expand Up @@ -323,4 +328,4 @@ def crawl(data_source_id, url, crawled_urls, max_pages, chatbot_id):
except Exception as e:
# Handle other exceptions (e.g., invalid HTML, network issues) and continue crawling
logging.exception(f"An unexpected error occurred while crawling URL: {url}")
pass
pass
Loading