From 11be0c843701f98f05a64d2ca2078cd87bb3209f Mon Sep 17 00:00:00 2001 From: lvalics Date: Thu, 26 Oct 2023 20:06:03 +0000 Subject: [PATCH 1/6] Started to work on NGINX replacer for yourdomain.com from APP_URL --- dj_backend_server/Dockerfile.nginx | 7 +++++++ dj_backend_server/docker-compose.linux.yaml | 6 ++++++ dj_backend_server/entrypoint-nginx.sh | 16 ++++++++++++++++ 3 files changed, 29 insertions(+) create mode 100644 dj_backend_server/Dockerfile.nginx create mode 100644 dj_backend_server/entrypoint-nginx.sh diff --git a/dj_backend_server/Dockerfile.nginx b/dj_backend_server/Dockerfile.nginx new file mode 100644 index 00000000..34821296 --- /dev/null +++ b/dj_backend_server/Dockerfile.nginx @@ -0,0 +1,7 @@ +# Use the official Nginx image +FROM nginx + +COPY ./nginx/nginx.conf /etc/nginx/nginx.conf.template +COPY ./entrypoint-nginx.sh /entrypoint-nginx.sh +RUN chmod +x /entrypoint-nginx.sh +ENTRYPOINT ["/entrypoint-nginx.sh"] \ No newline at end of file diff --git a/dj_backend_server/docker-compose.linux.yaml b/dj_backend_server/docker-compose.linux.yaml index 430aa2de..7dc23c44 100644 --- a/dj_backend_server/docker-compose.linux.yaml +++ b/dj_backend_server/docker-compose.linux.yaml @@ -35,6 +35,9 @@ services: nginx: image: nginx container_name: oc_nginx + build: + context: . 
+ dockerfile: Dockerfile.nginx restart: unless-stopped ports: - "80:80" @@ -46,6 +49,9 @@ services: - ./static:/app/web/static/ networks: - openchat_network + env_file: + - .env.docker + #entrypoint: ["/entrypoint-nginx.sh"] depends_on: - qdrant - mysql diff --git a/dj_backend_server/entrypoint-nginx.sh b/dj_backend_server/entrypoint-nginx.sh new file mode 100644 index 00000000..b7088b72 --- /dev/null +++ b/dj_backend_server/entrypoint-nginx.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Remove 'http://' or 'https://' prefix from APP_URL +CLEANED_APP_URL=${APP_URL#http://} +CLEANED_APP_URL=${APP_URL#https://} + +echo "Replacing APP_URL with $CLEANED_APP_URL" + +# Define the file path as a variable, for example: +NGINX_CONF="/etc/nginx/nginx.conf" + +sed "s|yourdomain.com|$CLEANED_APP_URL|g" NGINX_CONF > /tmp/nginx.conf +mv /tmp/nginx.conf NGINX_CONF + +# Start your app normally +# exec nginx -g "daemon off;" From 4ce1983a03b845166014bd9654e48364aa804637 Mon Sep 17 00:00:00 2001 From: lvalics Date: Thu, 26 Oct 2023 20:50:53 +0000 Subject: [PATCH 2/6] CORS issue fixed. 
--- .../api/middleware/cors_middleware.py | 26 +++++++++++++++++++ .../dj_backend_server/settings.py | 1 + dj_backend_server/nginx/nginx.conf | 2 +- 3 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 dj_backend_server/api/middleware/cors_middleware.py diff --git a/dj_backend_server/api/middleware/cors_middleware.py b/dj_backend_server/api/middleware/cors_middleware.py new file mode 100644 index 00000000..7526c814 --- /dev/null +++ b/dj_backend_server/api/middleware/cors_middleware.py @@ -0,0 +1,26 @@ +from django.utils.deprecation import MiddlewareMixin +from web.models.chatbot import Chatbot + +class CorsMiddleware(MiddlewareMixin): + def process_response(self, request, response): + # Get the origin of the request + origin = request.META.get('HTTP_ORIGIN') + + # Check if the origin is in the database + origin_in_db = Chatbot.objects.filter(website=origin).exists() + print(f"Origin of the request: {origin}") + print(f"Is the origin in the database: {origin_in_db}") + + if origin_in_db: + # Add the 'Access-Control-Allow-Origin' header to the response + response['Access-Control-Allow-Origin'] = origin + response['Access-Control-Allow-Methods'] = 'GET, POST, OPTIONS' + response['Access-Control-Allow-Headers'] = 'X-Requested-With, Content-Type, X-Bot-Token' + + print(f"Website URLs checked: {[chatbot.website for chatbot in Chatbot.objects.all()]}") + print(f"Response status code: {response.status_code}") + print(f"Response content: {response.content}") + print(f"Response headers: {response.headers}") + + + return response \ No newline at end of file diff --git a/dj_backend_server/dj_backend_server/settings.py b/dj_backend_server/dj_backend_server/settings.py index 324358ee..7de66824 100644 --- a/dj_backend_server/dj_backend_server/settings.py +++ b/dj_backend_server/dj_backend_server/settings.py @@ -62,6 +62,7 @@ MIDDLEWARE = [ 'django.middleware.locale.LocaleMiddleware', 'django.middleware.security.SecurityMiddleware', + 
 'api.middleware.cors_middleware.CorsMiddleware', 'django.contrib.sessions.middleware.SessionMiddleware', 'django.middleware.common.CommonMiddleware', 'django.middleware.csrf.CsrfViewMiddleware', diff --git a/dj_backend_server/nginx/nginx.conf b/dj_backend_server/nginx/nginx.conf index 5252aa86..e23fe1e7 100644 --- a/dj_backend_server/nginx/nginx.conf +++ b/dj_backend_server/nginx/nginx.conf @@ -110,7 +110,7 @@ http { proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header X-Forwarded-Proto $scheme; # Forward the original scheme (HTTP or HTTPS) - proxy_set_header Origin ""; # Optionally forward the Origin header + proxy_set_header Origin $http_origin; # Optionally forward the Origin header proxy_ssl_protocols TLSv1 TLSv1.1 TLSv1.2 TLSv1.3; add_header Cache-Control "public, max-age=2592000"; add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; # HSTS header From 03669038a055e3338f4204349656b44a5b7074f5 Mon Sep 17 00:00:00 2001 From: lvalics Date: Thu, 26 Oct 2023 20:51:17 +0000 Subject: [PATCH 3/6] CORS issue fixed 2. 
--- dj_backend_server/api/middleware/cors_middleware.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dj_backend_server/api/middleware/cors_middleware.py b/dj_backend_server/api/middleware/cors_middleware.py index 7526c814..83a3fa6b 100644 --- a/dj_backend_server/api/middleware/cors_middleware.py +++ b/dj_backend_server/api/middleware/cors_middleware.py @@ -8,8 +8,8 @@ def process_response(self, request, response): # Check if the origin is in the database origin_in_db = Chatbot.objects.filter(website=origin).exists() - print(f"Origin of the request: {origin}") - print(f"Is the origin in the database: {origin_in_db}") + # print(f"Origin of the request: {origin}") + # print(f"Is the origin in the database: {origin_in_db}") if origin_in_db: # Add the 'Access-Control-Allow-Origin' header to the response @@ -17,10 +17,10 @@ def process_response(self, request, response): response['Access-Control-Allow-Methods'] = 'GET, POST, OPTIONS' response['Access-Control-Allow-Headers'] = 'X-Requested-With, Content-Type, X-Bot-Token' - print(f"Website URLs checked: {[chatbot.website for chatbot in Chatbot.objects.all()]}") - print(f"Response status code: {response.status_code}") - print(f"Response content: {response.content}") - print(f"Response headers: {response.headers}") + # print(f"Website URLs checked: {[chatbot.website for chatbot in Chatbot.objects.all()]}") + # print(f"Response status code: {response.status_code}") + # print(f"Response content: {response.content}") + # print(f"Response headers: {response.headers}") return response \ No newline at end of file From 476cbd79f8ec8f03de50896b27ea675efd207e95 Mon Sep 17 00:00:00 2001 From: lvalics Date: Fri, 27 Oct 2023 12:53:28 +0000 Subject: [PATCH 4/6] Added extra settings to handle large files and some small tweaking usually I use on nginx. 
Also in cors_middleware added to check also APP_URL --- .../api/middleware/cors_middleware.py | 12 ++-- dj_backend_server/nginx/nginx.conf | 56 +++++++++++++++++++ 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/dj_backend_server/api/middleware/cors_middleware.py b/dj_backend_server/api/middleware/cors_middleware.py index 83a3fa6b..8cc01863 100644 --- a/dj_backend_server/api/middleware/cors_middleware.py +++ b/dj_backend_server/api/middleware/cors_middleware.py @@ -1,5 +1,6 @@ from django.utils.deprecation import MiddlewareMixin from web.models.chatbot import Chatbot +import os class CorsMiddleware(MiddlewareMixin): def process_response(self, request, response): @@ -7,11 +8,14 @@ def process_response(self, request, response): origin = request.META.get('HTTP_ORIGIN') # Check if the origin is in the database - origin_in_db = Chatbot.objects.filter(website=origin).exists() - # print(f"Origin of the request: {origin}") - # print(f"Is the origin in the database: {origin_in_db}") + # Get APP_URL from environment variables + app_url = os.getenv('APP_URL') + #print(f"Origin of the APP_URL: {app_url} == {origin}") - if origin_in_db: + # Check if the origin is in the database or equal to APP_URL + origin_in_db = origin == app_url or Chatbot.objects.filter(website=origin).exists() + + if origin_in_db: # Add the 'Access-Control-Allow-Origin' header to the response response['Access-Control-Allow-Origin'] = origin response['Access-Control-Allow-Methods'] = 'GET, POST, OPTIONS' diff --git a/dj_backend_server/nginx/nginx.conf b/dj_backend_server/nginx/nginx.conf index e23fe1e7..d2e755a9 100644 --- a/dj_backend_server/nginx/nginx.conf +++ b/dj_backend_server/nginx/nginx.conf @@ -9,6 +9,34 @@ http { listen 80; server_name yourdomain.com; # Replace with your domain name or IP address + # Duplicate your existing settings here + charset utf-8; + + keepalive_timeout 500; + keepalive_requests 5000; + + client_max_body_size 64m; + client_body_buffer_size 64m; + + sendfile 
on; + server_tokens off; + + tcp_nopush on; + tcp_nodelay on; + reset_timedout_connection on; + + gzip on; + gzip_comp_level 5; + gzip_min_length 256; + gzip_proxied any; + gzip_types application/javascript application/json application/xml text/css text/plain text/xml; + gzip_vary on; + + open_file_cache max=1000 inactive=20s; + open_file_cache_valid 30s; + open_file_cache_min_uses 2; + open_file_cache_errors on; + location /static { proxy_pass https://web:8000; expires -1; #dev env @@ -73,6 +101,34 @@ http { text/html html; } + # Duplicate your existing settings here + charset utf-8; + + keepalive_timeout 500; + keepalive_requests 5000; + + client_max_body_size 64m; + client_body_buffer_size 64m; + + sendfile on; + server_tokens off; + + tcp_nopush on; + tcp_nodelay on; + reset_timedout_connection on; + + gzip on; + gzip_comp_level 5; + gzip_min_length 256; + gzip_proxied any; + gzip_types application/javascript application/json application/xml text/css text/plain text/xml; + gzip_vary on; + + open_file_cache max=1000 inactive=20s; + open_file_cache_valid 30s; + open_file_cache_min_uses 2; + open_file_cache_errors on; + # location /static/ { # alias /app/web/static/; # The trailing slash is important # # proxy_set_header Host $host; From 97881a69562f5e7d48f3ac5eafe1cf2c0d813a0c Mon Sep 17 00:00:00 2001 From: lvalics Date: Fri, 27 Oct 2023 13:15:19 +0000 Subject: [PATCH 5/6] CSRF fix in settings.py --- .../api/middleware/cors_middleware.py | 6 ++---- dj_backend_server/dj_backend_server/settings.py | 14 +++++++++----- .../web/templates/onboarding/step-2-pdf.html | 9 ++++++++- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/dj_backend_server/api/middleware/cors_middleware.py b/dj_backend_server/api/middleware/cors_middleware.py index 8cc01863..a423ca64 100644 --- a/dj_backend_server/api/middleware/cors_middleware.py +++ b/dj_backend_server/api/middleware/cors_middleware.py @@ -21,10 +21,8 @@ def process_response(self, request, response): 
response['Access-Control-Allow-Methods'] = 'GET, POST, OPTIONS' response['Access-Control-Allow-Headers'] = 'X-Requested-With, Content-Type, X-Bot-Token' - # print(f"Website URLs checked: {[chatbot.website for chatbot in Chatbot.objects.all()]}") + #print(f"Website URLs checked: {[chatbot.website for chatbot in Chatbot.objects.all()]}") # print(f"Response status code: {response.status_code}") # print(f"Response content: {response.content}") - # print(f"Response headers: {response.headers}") - - + #print(f"Response headers: {response.headers}") return response \ No newline at end of file diff --git a/dj_backend_server/dj_backend_server/settings.py b/dj_backend_server/dj_backend_server/settings.py index 7de66824..a330e9a9 100644 --- a/dj_backend_server/dj_backend_server/settings.py +++ b/dj_backend_server/dj_backend_server/settings.py @@ -69,7 +69,7 @@ 'django.contrib.auth.middleware.AuthenticationMiddleware', 'django.contrib.messages.middleware.MessageMiddleware', 'django.middleware.clickjacking.XFrameOptionsMiddleware', - 'corsheaders.middleware.CorsMiddleware', + #'corsheaders.middleware.CorsMiddleware', ] ROOT_URLCONF = 'dj_backend_server.urls' @@ -184,13 +184,17 @@ SESSION_ENGINE = 'django.contrib.sessions.backends.db' # You can choose other engines as well -#ALLOWED_HOSTS = [ -# 'localhost', -# '0.0.0.0', -#] ALLOWED_HOSTS = os.environ.get('ALLOWED_HOSTS', '0.0.0.0').split(',') APP_URL = os.environ.get('APP_URL', 'http://0.0.0.0:8000') CORS_ALLOWED_ORIGINS = [ APP_URL, +] + +CSRF_TRUSTED_ORIGINS = [ + APP_URL, +] + +CSRF_COOKIE_DOMAIN = [ + APP_URL, ] \ No newline at end of file diff --git a/dj_backend_server/web/templates/onboarding/step-2-pdf.html b/dj_backend_server/web/templates/onboarding/step-2-pdf.html index b7273295..7d54b0d4 100644 --- a/dj_backend_server/web/templates/onboarding/step-2-pdf.html +++ b/dj_backend_server/web/templates/onboarding/step-2-pdf.html @@ -110,7 +110,14 @@

Upload PDF files as sources
- +
+
+ + +
+
Make sure that your files are scannable (text not images) 🫶
From ace43d88fe9c5f536b0451c87d518ec5c6eb75bf Mon Sep 17 00:00:00 2001 From: lvalics Date: Fri, 27 Oct 2023 17:58:03 +0000 Subject: [PATCH 6/6] Fixed DB structure of crawled page. Now pages are written into DB and show in Dashboard. --- .../0006_crawledpages_content_file.py | 18 ++++++++++++++++++ .../migrations/0007_alter_crawledpages_id.py | 18 ++++++++++++++++++ dj_backend_server/web/models/crawled_pages.py | 4 ++-- 3 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 dj_backend_server/web/migrations/0006_crawledpages_content_file.py create mode 100644 dj_backend_server/web/migrations/0007_alter_crawledpages_id.py diff --git a/dj_backend_server/web/migrations/0006_crawledpages_content_file.py b/dj_backend_server/web/migrations/0006_crawledpages_content_file.py new file mode 100644 index 00000000..189f67cd --- /dev/null +++ b/dj_backend_server/web/migrations/0006_crawledpages_content_file.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.3 on 2023-10-27 17:32 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('web', '0005_pdfdatasourceerrorlog'), + ] + + operations = [ + migrations.AddField( + model_name='crawledpages', + name='content_file', + field=models.CharField(max_length=255, null=True), + ), + ] diff --git a/dj_backend_server/web/migrations/0007_alter_crawledpages_id.py b/dj_backend_server/web/migrations/0007_alter_crawledpages_id.py new file mode 100644 index 00000000..2952029b --- /dev/null +++ b/dj_backend_server/web/migrations/0007_alter_crawledpages_id.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.3 on 2023-10-27 17:49 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('web', '0006_crawledpages_content_file'), + ] + + operations = [ + migrations.AlterField( + model_name='crawledpages', + name='id', + field=models.AutoField(primary_key=True, serialize=False), + ), + ] diff --git 
a/dj_backend_server/web/models/crawled_pages.py b/dj_backend_server/web/models/crawled_pages.py index fd9069a3..70a0d64e 100644 --- a/dj_backend_server/web/models/crawled_pages.py +++ b/dj_backend_server/web/models/crawled_pages.py @@ -4,7 +4,7 @@ from web.models.chatbot import Chatbot class CrawledPages(models.Model): - id = models.CharField(max_length=36, primary_key=True) + id = models.AutoField(primary_key=True) chatbot_id = models.CharField(max_length=36, null=True) website_data_source = models.ForeignKey(WebsiteDataSource, on_delete=models.CASCADE, related_name='crawled_pages') url = models.CharField(max_length=255) @@ -13,7 +13,7 @@ class CrawledPages(models.Model): aws_url = models.TextField(null=True) created_at = models.DateTimeField(auto_now_add=True) updated_at = models.DateTimeField(auto_now=True) - # content_file= models.CharField(max_length=100) + content_file= models.CharField(max_length=255, null=True) def get_id(self): return self.id