openchatai · codebanesr · Feb 15, 2024 · Feb 7, 2024 · Feb 13, 2024 · Feb 13, 2024
diff --git a/dj_backend_server/CHANGELOG.MD b/dj_backend_server/CHANGELOG.MD
@@ -0,0 +1,6 @@
+2.14.2024
+- Added example.env to streamline environment setup.
+- Implemented translation fixes to enhance application localization.
+- Updated docker-compose.yaml to prefix each container with oc_ for better namespace management.
+- Performed fixes in requirements.txt for improved dependency resolution.
+- Ensured the vector database (Qdrant) is created and available before web crawling starts, to address issues encountered with large websites.
diff --git a/dj_backend_server/api/data_sources/pdf_handler.py b/dj_backend_server/api/data_sources/pdf_handler.py
@@ -222,15 +222,17 @@ def txt_to_vectordb(shared_folder: str, namespace: str, delete_folder_flag: bool
 
         docs = text_splitter.split_documents(raw_docs)
 
-        print("external files docs -->", docs);
+        # print("external files docs -->", docs);
 
         if not docs:
              print("No documents were processed successfully.")
              return
 
         embeddings = get_embeddings()
 
+        print(f"Initializing vector store for namespace: {namespace} with {len(docs)} documents.")
         init_vector_store(docs, embeddings, StoreOptions(namespace=namespace))
+        print(f"Vector store initialized successfully for namespace: {namespace}.")
 
         print(f'Folder need or not to delete. {delete_folder_flag}')
         # Delete folder if flag is set
@@ -243,7 +245,7 @@ def txt_to_vectordb(shared_folder: str, namespace: str, delete_folder_flag: bool
         # pdf_data_source.save()
         failed_job = FailedJob(uuid=str(uuid4()), connection='default', queue='default', payload='txt_to_vectordb', exception=str(e),failed_at=timezone.now())
         failed_job.save()
-        print(e)
+        print(f"Failed to initialize vector store for namespace: {namespace}. Exception: {e}")
         traceback.print_exc()
 
 

diff --git a/dj_backend_server/api/utils/init_vector_store.py b/dj_backend_server/api/utils/init_vector_store.py
@@ -128,3 +128,31 @@ def delete_from_vector_store(namespace: str, filter_criteria: dict) -> None:
     else:
         raise NotImplementedError(f"Delete operation is not implemented for the store type: {store_type}")
 
+
+def ensure_vector_database_exists(namespace):
+    """Best-effort check that a Qdrant collection named `namespace` exists, creating it if missing; errors inside the try are printed, not raised."""
+    store_type = StoreType[os.environ['STORE']]
+    try:
+        if store_type == StoreType.QDRANT:
+            client = QdrantClient(url=os.environ['QDRANT_URL'])
+
+            def collection_exists():
+                # get_collections() yields collection description objects, so match on .name rather than on the object itself.
+                return any(collection.name == namespace for collection in client.get_collections().collections)
+            for attempt in range(3):
+                if collection_exists():
+                    print(f"Namespace '{namespace}' exists.")
+                    return
+                print(f"Namespace '{namespace}' does not exist. Attempting to create.")
+                # 1536-dimensional vectors with cosine distance; adjust as necessary
+                vectors_config = models.VectorParams(size=1536, distance=models.Distance.COSINE)
+                client.create_collection(collection_name=namespace, vectors_config=vectors_config)
+                # Recheck if the namespace was successfully created
+                if collection_exists():
+                    print(f"Namespace '{namespace}' successfully created.")
+                    return
+                print(f"Failed to create namespace '{namespace}' on attempt {attempt + 1}.")
+            raise Exception(f"Failed to ensure or create namespace '{namespace}' after 3 attempts.")
+    except Exception as e:
+        print(f"Failed to ensure vector database exists for namespace {namespace}: {e}")
+
diff --git a/dj_backend_server/docker-compose.yaml b/dj_backend_server/docker-compose.yaml
@@ -3,7 +3,7 @@ version: '3.9'
 services:
   mysql:
     restart: unless-stopped
-    platform: linux/arm64/v8
+    container_name: oc_mysql
     image: "mysql:8"
     ports:
       - "3307:3306"
@@ -20,6 +20,7 @@ services:
 
   qdrant:
     image: qdrant/qdrant
+    container_name: oc_qdrant
     ports:
       - 6333:6333
       - 6334:6334
@@ -32,7 +33,7 @@ services:
     build:
       context: .
       dockerfile: Dockerfile
-    container_name: web
+    container_name: oc_web
     ports:
       - "8001:8000"
     volumes:
@@ -53,6 +54,7 @@ services:
 
   adminer:
     image: adminer
+    container_name: oc_adminer
     ports:
       - "8080:8080"
     environment:
@@ -66,7 +68,7 @@ services:
     build:
       context: .
       dockerfile: Dockerfile
-    container_name: celery
+    container_name: oc_celery
     volumes:
       - ./website_data_sources:/app/website_data_sources
     # - ./llama-2-7b-chat.ggmlv3.q4_K_M.bin:/app/llama-2-7b-chat.ggmlv3.q4_K_M.bin:ro
@@ -80,7 +82,7 @@ services:
 
   redis:
     image: redis:latest
-    container_name: redis_cache
+    container_name: oc_redis_cache
     ports:
       - "6379:6379"
     volumes:

diff --git a/dj_backend_server/example.env b/dj_backend_server/example.env
@@ -0,0 +1,75 @@
+##########################################################
+
+# Edit values for your site.
+# your app secret key
+SECRET_KEY='ADD-YOUR-CUSTOM-KEY-HERE'
+# For openai
+OPENAI_API_KEY=YOURKEY
+# Comma-separated host names / IPs that are allowed (Django ALLOWED_HOSTS)
+ALLOWED_HOSTS=localhost,0.0.0.0
+# Use * only in dev environment
+#ALLOWED_HOSTS=* 
+# Your SITE URL
+APP_URL='https://YOUR-URL-HERE'
+
+##########################################################
+
+# "azure" | "openai" | llama2
+OPENAI_API_TYPE=openai
+OPENAI_API_MODEL=gpt-4-1106-preview
+OPENAI_API_TEMPERATURE=1
+
+# If using azure
+# AZURE_OPENAI_API_BASE=
+# AZURE_OPENAI_API_KEY=
+# AZURE_OPENAI_API_VERSION=2023-03-15-preview
+# AZURE_OPENAI_EMBEDDING_MODEL_NAME=
+# AZURE_OPENAI_DEPLOYMENT_NAME=
+# AZURE_OPENAI_COMPLETION_MODEL=gpt-35-turbo
+
+# "azure" | "openai" | llama2
+EMBEDDING_PROVIDER=openai
+
+# Vector Store, PINECONE|QDRANT
+STORE=QDRANT
+
+
+# if using pinecone
+# PINECONE_API_KEY=
+# PINECONE_ENV=
+# VECTOR_STORE_INDEX_NAME=
+
+
+# if using qdrant
+QDRANT_URL=http://qdrant:6333
+
+
+# optional, defaults to 15
+MAX_PAGES_CRAWL=150
+
+# --- these will change if you decide to start testing the software
+CELERY_BROKER_URL=redis://redis:6379/
+CELERY_RESULT_BACKEND=redis://redis:6379/
+DATABASE_NAME=openchat
+DATABASE_USER=dbuser
+DATABASE_PASSWORD=dbpass
+DATABASE_HOST=mysql
+DATABASE_PORT=3306
+
+# use 'external' if you want to use below services.
+PDF_LIBRARY = 'external'
+
+#PDF API - OCRWebService.com (REST API). https://www.ocrwebservice.com/api/restguide
+#Extract text from scanned images and PDF documents and convert into editable formats.
+#Please create new account with ocrwebservice.com via http://www.ocrwebservice.com/account/signup and get license code
+OCR_LICCODE = 'LICENSE-CODE'
+OCR_USERNAME =  'USERNAME'
+OCR_LANGUAGE = 'english'
+# Cleans up the OCR text, which can be messy and full of garbage, using the LLM; this will incur a cost if the LLM is paid. Use carefully.
+# Use 1 to enable, 0 to disable.
+OCR_LLM = '1'
+
+# retrieval_qa | conversation_retrieval, retrieval_qa works better with azure openai
+# if you want to use the conversation_retrieval | retrieval_qa chain
+CHAIN_TYPE=conversation_retrieval
+
diff --git a/dj_backend_server/requirements.txt b/dj_backend_server/requirements.txt
@@ -19,8 +19,7 @@ click-repl==0.3.0
 cryptography==41.0.3
 dataclasses-json==0.5.14
 Django==4.2.3
-django-rest-swagger
-djangorestframework
+django-rest-swagger==2.2.0
 dnspython==2.4.1
 drf-spectacular==0.27.1
 drf_spectacular.extensions==0.0.2
@@ -31,8 +30,8 @@ grpcio-tools==1.56.2
 h11==0.14.0
 h2==4.1.0
 hpack==4.0.0
-httpcore1.0.2
-httpx=0.25.2
+httpcore==1.0.2
+httpx==0.25.2
 hyperframe==6.0.1
 idna==3.6
 kombu==5.3.1

diff --git a/dj_backend_server/web/templates/onboarding/other-data-sources-website.html b/dj_backend_server/web/templates/onboarding/other-data-sources-website.html
@@ -79,20 +79,16 @@ <h1 class="text-3xl text-slate-800 font-bold mb-6">{% trans 'Website information
 
                     <div class="flex items-center justify-between space-x-6 mb-8">
                         <div>
-                            <div class="font-medium text-slate-800 text-sm mb-1">{% trans "Just to make sure we are on
-                                the same page" %}</div>
+                            <div class="font-medium text-slate-800 text-sm mb-1">{% trans "Just to make sure we are on the same page" %}</div>
                             <div class="text-xs">
-                                {% trans "Sometimes, we might face challenges when trying to crawl certain websites,
-                                especially the ones built using JavaScript (Single-Page Applications). However, we're
-                                currently working on adding headless browsing to our system so that we can support all
-                                kinds of websites." %}
+                                {% trans "Sometimes, we might face challenges when trying to crawl certain websites, especially the ones built using JavaScript (Single-Page Applications). However, we're currently working on adding headless browsing to our system so that we can support all kinds of websites." %}
                             </div>
                         </div>
                     </div>
                 </div>
                 <div class="flex items-center justify-between">
                     <a class="text-sm underline hover:no-underline" href="{% url 'onboarding.data-source' %}">&lt;- {% trans "Back" %}</a>
-                    <button type="submit" class="btn bg-indigo-500 hover:bg-indigo-600 text-white ml-auto">{% trans "Next Step" %}</button>
+                    <button type="submit" class="btn bg-primary text-white py-2 px-3">{% trans "Next Step" %}</button>
                 </div>
             </form>
         </div>

diff --git a/dj_backend_server/web/templates/onboarding/step-0.html b/dj_backend_server/web/templates/onboarding/step-0.html
@@ -56,8 +56,7 @@ <h1 class="text-3xl text-slate-800 font-bold mb-6">{% trans "Let's set up your f
                                 </div>
                                 <h3 class="text-lg font-bold text-slate-800 pl-9">{% trans "You provide the system with data" %}</h3>
                             </div>
-                            <div class="pl-9">{% trans "It could be a website, pdf files, and soon you will have the option to
-                                integrate with many more" %}</div>
+                            <div class="pl-9">{% trans "It could be a website, pdf files, and soon you will have the option to integrate with many more" %}</div>
                         </li>
                         <!-- List item -->
                         <li class="relative py-2">

diff --git a/dj_backend_server/web/templates/onboarding/step-2.html b/dj_backend_server/web/templates/onboarding/step-2.html
@@ -78,7 +78,7 @@ <h1 class="text-3xl text-slate-800 font-bold mb-6">{% trans "Website information
 
                     <div class="flex items-center justify-between space-x-6 mb-8">
                         <div>
-                            <div class="font-medium text-slate-800 text-sm mb-1">{% trans "Just to make sure we are on the same page" %} 🫶
+                            <div class="font-medium text-slate-800 text-sm mb-1">{% trans "Just to make sure we are on the same page" %} 
                             </div>
                             <div class="text-xs">
                                 {% trans "We might not be able to crawl some websites, especially websites that are built using JS (SPA), we are working on adding headless browsing to support all sorts of websites." %}

diff --git a/dj_backend_server/web/workers/crawler.py b/dj_backend_server/web/workers/crawler.py
@@ -5,6 +5,7 @@
 from web.signals.website_data_source_crawling_was_completed import website_data_source_crawling_completed 
 from web.models.crawled_pages import CrawledPages
 from web.models.website_data_sources import WebsiteDataSource
+from api.utils.init_vector_store import ensure_vector_database_exists
 from django.core.files.storage import default_storage
 from django.core.files.base import ContentFile
 from django.utils.text import slugify
@@ -35,6 +36,10 @@ def start_recursive_crawler(data_source_id, chatbot_id):
         Exception: If any error occurs during the crawling process, the function will catch the exception, set the
         crawling status to "failed", and re-raise the exception.
     """
+    # Ensure vector database exists before starting the crawl
+
+    ensure_vector_database_exists(str(chatbot_id))
+    # print("Starting recursive crawler")
     data_source = WebsiteDataSource.objects.get(pk=data_source_id)
     root_url = data_source.root_url
 
@@ -323,4 +328,4 @@ def crawl(data_source_id, url, crawled_urls, max_pages, chatbot_id):
     except Exception as e:
         # Handle other exceptions (e.g., invalid HTML, network issues) and continue crawling
         logging.exception(f"An unexpected error occurred while crawling URL: {url}")
-        pass
+        pass