From ef7ce95e5f5403394a32bf9f57247f10cedf6c90 Mon Sep 17 00:00:00 2001
From: lvalics
Date: Sat, 6 Jan 2024 19:46:33 +0000
Subject: [PATCH 1/3] Show on the data pages how many URLs were crawled and
 how many PDFs were parsed.

---
 dj_backend_server/web/templates/settings-data.html    | 8 ++++++--
 dj_backend_server/web/views/views_chatbot_settings.py | 3 ++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/dj_backend_server/web/templates/settings-data.html b/dj_backend_server/web/templates/settings-data.html
index 15d12859..9dac4554 100644
--- a/dj_backend_server/web/templates/settings-data.html
+++ b/dj_backend_server/web/templates/settings-data.html
@@ -51,8 +51,12 @@

         {{bot.name}}: data sources
 
         Data sources
 
-        Currently we only support web based sources, soon we will support Pdf,
-        Text and even videos!
+        <ul>
+            <li>Website data sources: {{website_data_sources.count}} with crawled pages: {{ crawled_pages_count }}</li>
+            <li>PDF data sources: {{pdf_data_sources.count}}</li>
+            <li>Codebase data sources: {{codebase_data_sources.count}}</li>
+        </ul>
+
 
 {% if messages %}
diff --git a/dj_backend_server/web/views/views_chatbot_settings.py b/dj_backend_server/web/views/views_chatbot_settings.py
index 5e51f2f8..bc178d04 100644
--- a/dj_backend_server/web/views/views_chatbot_settings.py
+++ b/dj_backend_server/web/views/views_chatbot_settings.py
@@ -81,8 +81,9 @@ def get_history_by_session_id(request, id, session_id):
 
 def data_settings(request, id):
     bot = get_object_or_404(Chatbot, id=id)
     website_data_sources = WebsiteDataSource.objects.filter(chatbot_id=id).prefetch_related('crawled_pages').order_by('-id')
     pdf_data_sources = PdfDataSource.objects.filter(chatbot_id=id).order_by('-id')
     codebase_data_sources = CodebaseDataSource.objects.filter(chatbot_id=id).order_by('-id')
+    crawled_pages_count = CrawledPages.objects.filter(website_data_source__chatbot_id=id).count()
 
     for source in pdf_data_sources:
         merged_files = []
@@ -122,7 +123,7 @@ def data_settings(request, id):
 
         source.merged_files = merged_files
         source.status_html = status_html
 
-    return render(request, 'settings-data.html', {'bot': bot, 'website_data_sources': website_data_sources, 'pdf_data_sources': pdf_data_sources, 'codebase_data_sources': codebase_data_sources})
+    return render(request, 'settings-data.html', {'bot': bot, 'website_data_sources': website_data_sources, 'pdf_data_sources': pdf_data_sources, 'codebase_data_sources': codebase_data_sources, 'crawled_pages_count': crawled_pages_count})
 
 
 def analytics_settings(request, id):
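Side note on the view above: it issues one extra COUNT query per request. If per-source page counts were ever wanted alongside the total, a single annotated queryset could serve both. This is a sketch only, assuming the `crawled_pages` related name implied by the `prefetch_related('crawled_pages')` call in the patch:

    from django.db.models import Count

    # Sketch: per-source page counts plus the chatbot-wide total in one query,
    # assuming CrawledPages relates to WebsiteDataSource via related_name='crawled_pages'.
    website_data_sources = (
        WebsiteDataSource.objects
        .filter(chatbot_id=id)
        .annotate(page_count=Count('crawled_pages'))
        .order_by('-id')
    )
    crawled_pages_count = sum(source.page_count for source in website_data_sources)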
From 9be6f112c69d0e21361bfb5b3b45cad4a9eb12aa Mon Sep 17 00:00:00 2001
From: lvalics
Date: Sat, 6 Jan 2024 20:17:34 +0000
Subject: [PATCH 2/3] Document crawler.py

---
 dj_backend_server/web/workers/crawler.py | 130 ++++++++++++++++++++++-
 1 file changed, 127 insertions(+), 3 deletions(-)

diff --git a/dj_backend_server/web/workers/crawler.py b/dj_backend_server/web/workers/crawler.py
index 940b0a13..35779d91 100644
--- a/dj_backend_server/web/workers/crawler.py
+++ b/dj_backend_server/web/workers/crawler.py
@@ -21,6 +21,20 @@
 
 def start_recursive_crawler(data_source_id, chatbot_id):
+    """
+    This function starts a recursive crawler on a given data source. If the data source's crawling status is already
+    completed, the function returns immediately. Otherwise, it sets the status to "in progress", starts the
+    crawling process, and handles the completion of the crawl. If any exception occurs along the way, the
+    crawling status is set to "failed".
+
+    Args:
+        data_source_id (int): The ID of the data source to be crawled. This should be the primary key of a WebsiteDataSource object.
+        chatbot_id (int): The ID of the chatbot initiating the crawl. This is used when handling the completion of the crawl.
+
+    Raises:
+        Exception: If any error occurs during the crawling process, the function catches the exception, sets the
+            crawling status to "failed", and re-raises the exception.
+    """
     data_source = WebsiteDataSource.objects.get(pk=data_source_id)
     root_url = data_source.root_url
@@ -44,11 +58,27 @@
         data_source.crawling_status = WebsiteDataSourceStatusType.FAILED.value
         data_source.save()
 
-# the file will be stored in the website_data_sources// directory.
 def store_crawled_page_content_to_database(url, response, chatbot_id, data_source_id, html):
+    """
+    This function stores the content of a crawled web page in the database. It first normalizes the HTML content,
+    saves it to a local file in the /website_data_sources directory, extracts the title of the page, and then creates
+    a CrawledPages object and saves it to the database. If any error occurs during the creation of the
+    CrawledPages object, the function prints the error message.
+
+    Args:
+        url (str): The URL of the web page that has been crawled.
+        response (Response): The response object returned by the request to the URL.
+        chatbot_id (int): The ID of the chatbot that initiated the crawl.
+        data_source_id (int): The ID of the data source from which the web page was crawled.
+        html (str): The HTML content of the web page.
+
+    Raises:
+        Exception: If any error occurs during the creation of the CrawledPages object, the function catches the
+            exception and prints the error message.
+    """
     html = get_normalized_content(html)
-    # Save the HTML content to a local file in /tmp directory
+    # Save the HTML content to a local file in the /website_data_sources directory
     file_name = str(uuid4()) + ".txt"
     folder_name = os.path.join("website_data_sources", str(data_source_id))
     file_path = os.path.join(folder_name, file_name)
@@ -73,6 +103,19 @@
 
 def calculate_crawling_progress(crawled_pages, max_pages):
+    """
+    This function calculates the progress of the crawling process. It divides the number of crawled pages by the maximum
+    number of pages that should be crawled, multiplies the result by 100 to get a percentage, and then rounds the result
+    to two decimal places. If the calculated progress exceeds 100, it is capped at 100. If the maximum number of pages
+    is zero or less, the function returns 0 to avoid division by zero.
+
+    Args:
+        crawled_pages (int): The number of pages that have been crawled.
+        max_pages (int): The maximum number of pages that should be crawled.
+
+    Returns:
+        float: The progress of the crawling process, as a percentage rounded to two decimal places.
+    """
     if max_pages <= 0:
         return 0  # Avoid division by zero
 
     progress = round((crawled_pages / max_pages) * 100, 2)
     return min(progress, 100)
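A quick worked check of the rounding and capping behaviour documented above (illustrative values only, assuming the function as defined in this patch):

    # Worked examples for calculate_crawling_progress.
    assert calculate_crawling_progress(37, 150) == 24.67  # 37 / 150 * 100 = 24.666... -> 24.67
    assert calculate_crawling_progress(120, 100) == 100   # progress above 100 is capped
    assert calculate_crawling_progress(5, 0) == 0         # max_pages <= 0 short-circuits to 0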
@@ -81,6 +124,19 @@
 
 def update_crawling_progress(chatbot_id, data_source_id, progress):
+    """
+    This function updates the crawling progress of a specific data source. It retrieves the data source object from
+    the database using the provided ID, sets its crawling progress to the provided value, and then saves the change
+    to the database. If no data source with the provided ID exists, the function does nothing.
+
+    Args:
+        chatbot_id (int): The ID of the chatbot that initiated the crawl. Currently not used in this function, but it could be useful for future enhancements.
+        data_source_id (int): The ID of the data source whose crawling progress should be updated.
+        progress (float): The new crawling progress, as a percentage.
+
+    Raises:
+        WebsiteDataSource.DoesNotExist: If no data source with the provided ID exists, the function catches this exception and does nothing.
+    """
     try:
         data_source = WebsiteDataSource.objects.get(pk=data_source_id)
         data_source.crawling_progress = progress
@@ -89,8 +145,19 @@
         pass
 
 
-# This method needs to be revisited and has to be written in a more sophisticated manner.
 def get_normalized_content(html):
+    """
+    This function normalizes the HTML content of a web page. It removes unwanted elements (such as 'script' and 'style' tags,
+    and elements with 'skip', 'menu', or 'dropdown' classes), extracts the text content from the cleaned HTML, removes extra
+    whitespace between words, and trims leading and trailing whitespace. The function uses BeautifulSoup to parse and manipulate
+    the HTML, and a recursive helper function to remove the unwanted elements.
+
+    Args:
+        html (str): The HTML content of the web page to be normalized.
+
+    Returns:
+        str: The normalized text content of the web page.
+    """
     # Define a list of tags and classes to exclude
     exclude_elements = ['script', 'style']
     exclude_classes = ['skip', 'menu', 'dropdown']
@@ -99,6 +166,16 @@
 
     # Function to recursively remove unwanted elements
     def clean_elements(element):
+        """
+        This function recursively removes unwanted elements from a BeautifulSoup element. It iterates over the direct children
+        of the provided element. If a child element's tag name is in the 'exclude_elements' list, or any of its classes are in the
+        'exclude_classes' list, the child element is replaced with whitespace. Otherwise, the function is called recursively on
+        the child element.
+
+        Args:
+            element (bs4.element.Tag): The BeautifulSoup element to clean. This should be a tag object that potentially contains
+                other tag objects (children).
+        """
         for child in element.find_all(recursive=False):
             if child.name in exclude_elements or any(cls in child.get('class', []) for cls in exclude_classes):
                 # Replace excluded elements with whitespace
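To illustrate the normalization just documented, a hypothetical input and its expected output (the exact whitespace handling depends on the implementation above, so treat this as a sketch):

    # Hypothetical example for get_normalized_content: the script tag and the
    # div with class 'menu' are stripped, remaining whitespace is collapsed.
    html = (
        "<html><body>"
        "<script>console.log('dropped');</script>"
        "<div class='menu'><a href='/'>Home</a></div>"
        "<p>Hello   world</p>"
        "</body></html>"
    )
    print(get_normalized_content(html))  # expected: "Hello world"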
@@ -121,6 +198,17 @@
 
 def get_crawled_page_title(html):
+    """
+    This function extracts the title of a web page from its HTML content. It uses BeautifulSoup to parse the HTML and find
+    the 'title' tag. If a 'title' tag is found, the function returns its text content; otherwise it returns None.
+    @TODO: check why the title is not being saved to the database.
+
+    Args:
+        html (str): The HTML content of the web page from which to extract the title.
+
+    Returns:
+        str or None: The title of the web page, or None if no 'title' tag is found.
+    """
     # Use BeautifulSoup to parse the HTML and extract the title
     soup = BeautifulSoup(html, 'html.parser')
     title_element = soup.find('title')
@@ -130,6 +218,18 @@
 
 def extract_links(html, root_url):
+    """
+    This function extracts all the URLs from the anchor tags in the HTML content of a web page. It uses BeautifulSoup to parse
+    the HTML and find all 'a' tags. The function then normalizes and filters the URLs, removing query parameters, fragments, and
+    URLs that do not belong to the same host as the root URL.
+
+    Args:
+        html (str): The HTML content of the web page from which to extract the URLs.
+        root_url (str): The root URL of the web page. This is used to filter out URLs that do not belong to the same host.
+
+    Returns:
+        list: A list of extracted, normalized, and filtered URLs.
+    """
     # Use BeautifulSoup to parse the HTML and extract all anchor tags
     soup = BeautifulSoup(html, 'html.parser')
     anchor_tags = soup.find_all('a')
@@ -157,6 +257,23 @@
 
 def crawl(data_source_id, url, crawled_urls, max_pages, chatbot_id):
+    """
+    This function crawls the web page at a given URL and stores its content in the database. It also extracts all the links
+    from the page and recursively crawls each one. The function keeps track of the URLs that have already been crawled to
+    avoid duplicates, and it stops crawling when the maximum number of pages has been reached. If an error occurs while
+    crawling a URL, the function logs the error and continues with the next URL.
+
+    Args:
+        data_source_id (int): The ID of the data source from which the web page is being crawled.
+        url (str): The URL of the web page to crawl.
+        crawled_urls (list): A list of URLs that have already been crawled. This is used to avoid crawling the same URL multiple times.
+        max_pages (int): The maximum number of pages to crawl.
+        chatbot_id (int): The ID of the chatbot that initiated the crawl.
+
+    Raises:
+        requests.exceptions.RequestException: If an error occurs while sending the HTTP GET request to the URL, the function catches this exception and does nothing.
+        Exception: If any other error occurs during the crawling process, the function catches the exception, logs the error, and continues with the next URL.
+    """
     # Check if the maximum page limit has been reached
     if len(crawled_urls) >= max_pages:
         return
@@ -168,6 +285,13 @@
 
     # Check if the URL ends with a binary file extension
     binary_extensions = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.tif', '.tiff', '.webp']
     if any(url.lower().endswith(ext) for ext in binary_extensions):
+        """
+        This block checks whether the URL ends with a binary file extension. It keeps a list of binary file
+        extensions (binary_extensions) and uses the 'any' function with a generator expression that checks whether
+        the URL (lower-cased for case-insensitive matching) ends with any of the extensions in the list. If it
+        does, the function returns immediately, effectively skipping binary files in the web crawling process.
+        """
         return
 
     # Add the current URL to the crawled URLs list
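For orientation, this is roughly how a crawl is kicked off with the functions documented above. The values are hypothetical; in practice max_pages would come from the data source's configuration:

    # Hypothetical kick-off: one shared visited list enforces the page budget
    # across every recursive branch of crawl().
    crawled_urls = []
    crawl(data_source_id=42, url="https://example.com",
          crawled_urls=crawled_urls, max_pages=50, chatbot_id=7)
    print(f"Crawled {len(crawled_urls)} pages")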
From 548250616095e473e5e65d724ee53a5e0e7257f6 Mon Sep 17 00:00:00 2001
From: lvalics
Date: Sat, 6 Jan 2024 20:31:23 +0000
Subject: [PATCH 3/3] Document ingest_website_data_source.py

---
 .../web/listeners/ingest_website_data_source.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/dj_backend_server/web/listeners/ingest_website_data_source.py b/dj_backend_server/web/listeners/ingest_website_data_source.py
index 97dfffcf..4b9e7ff6 100644
--- a/dj_backend_server/web/listeners/ingest_website_data_source.py
+++ b/dj_backend_server/web/listeners/ingest_website_data_source.py
@@ -3,8 +3,22 @@
 import requests
 from requests.exceptions import RequestException
 
-# @website_data_source_crawling_completed.connect
 def handle_crawling_completed(chatbot_id, website_data_source_id):
+    """
+    This function is triggered when the crawling of a website data source (see crawler.py) is completed. It sends a POST
+    request to an ingest service to start the ingestion of the crawled data. The request body includes the type of the
+    data source ('website'), the ID of the website data source (used as the 'shared_folder' parameter), and the ID of the
+    chatbot (used as the 'namespace' parameter). The URL of the ingest service is read from the 'APP_URL' environment
+    variable. If the ingest service returns a status code other than 200, the function raises an exception.
+
+    Args:
+        chatbot_id (int): The ID of the chatbot that initiated the crawl.
+        website_data_source_id (int): The ID of the website data source that has been crawled.
+
+    Raises:
+        Exception: If the ingest service returns a status code other than 200, the function raises an exception with a
+            message that includes the response text from the ingest service.
+    """
     request_body = {
         'type': 'website',
         'shared_folder': str(website_data_source_id),
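The patch is truncated here, before the request itself. Based on the docstring above, the call presumably looks roughly like the following; the endpoint path and error message are assumptions, not taken from the source:

    import os
    import requests

    # Hypothetical sketch of the POST described in the docstring.
    response = requests.post(
        os.environ.get('APP_URL') + '/api/ingest',  # path assumed, not from the patch
        json=request_body,
    )
    if response.status_code != 200:
        raise Exception(f"Failed to start ingestion: {response.text}")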