
Commit

Merge pull request #219 from lvalics/main
Data sources show the number of documents.
codebanesr committed Jan 6, 2024
2 parents c834403 + 5482506 commit ed459ac
Showing 4 changed files with 152 additions and 7 deletions.
16 changes: 15 additions & 1 deletion dj_backend_server/web/listeners/ingest_website_data_source.py
@@ -3,8 +3,22 @@
import requests
from requests.exceptions import RequestException

# @website_data_source_crawling_completed.connect
def handle_crawling_completed(chatbot_id, website_data_source_id):
"""
This function is triggered when the crawling (crawler.py) of a website data source is completed. It sends a POST request to an ingest
service to start the ingestion of the crawled data. The request body includes the type of the data source ('website'), the
ID of the website data source (used as the 'shared_folder' parameter), and the ID of the chatbot (used as the 'namespace'
parameter). The URL of the ingest service is retrieved from an environment variable ('APP_URL'). If the ingest service
returns a status code other than 200, the function raises an exception.
Args:
chatbot_id (int): The ID of the chatbot that initiated the crawl.
website_data_source_id (int): The ID of the website data source that has been crawled.
Raises:
Exception: If the ingest service returns a status code other than 200, the function raises an exception with a message
that includes the response text from the ingest service.
"""
request_body = {
'type': 'website',
'shared_folder': str(website_data_source_id),
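
The elided remainder of this listener sends the request the docstring describes. A minimal sketch of that flow, assuming the ingest endpoint is exposed under APP_URL at '/api/ingest' (the exact path and error handling are not visible in this hunk):

import os
import requests

def send_to_ingest_service(chatbot_id, website_data_source_id):
    # Illustrative reconstruction of the listener body described above.
    request_body = {
        'type': 'website',
        'shared_folder': str(website_data_source_id),
        'namespace': str(chatbot_id),
    }
    url = os.environ.get('APP_URL', 'http://localhost:8000') + '/api/ingest'  # path assumed
    response = requests.post(url, json=request_body)
    if response.status_code != 200:
        raise Exception(f"Ingestion failed: {response.text}")
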
8 changes: 6 additions & 2 deletions dj_backend_server/web/templates/settings-data.html
@@ -51,8 +51,12 @@ <h1 class="text-2xl md:text-3xl text-slate-800 font-bold">{{bot.name}}: data sou
<div class="p-6 space-y-6">
<section>
<h3 class="text-xl leading-snug text-slate-800 font-bold mb-1">Data sources</h3>
<div class="text-sm">Currently we only support web based sources, soon we will support Pdf,
Text and even videos!
<div class="text-sm">
<ul><li>Website data sources: {{website_data_sources.count}} with crawled pages: {{ crawled_pages_count }} </li>
<li>PDF data sources: {{pdf_data_sources.count}}</li>
<li>Codebase data sources: {{codebase_data_sources.count}} </li>
</ul>
</div>
</div>

{% if messages %}
5 changes: 4 additions & 1 deletion dj_backend_server/web/views/views_chatbot_settings.py
@@ -81,8 +81,11 @@ def get_history_by_session_id(request, id, session_id):
def data_settings(request, id):
bot = get_object_or_404(Chatbot, id=id)
website_data_sources = WebsiteDataSource.objects.filter(chatbot_id=id).prefetch_related('crawled_pages').order_by('-id')
#website_data_sources = WebsiteDataSource.objects.filter(chatbot_id=id).order_by('-id')
pdf_data_sources = PdfDataSource.objects.filter(chatbot_id=id).order_by('-id')
codebase_data_sources = CodebaseDataSource.objects.filter(chatbot_id=id).order_by('-id')
crawled_pages_count = CrawledPages.objects.filter(website_data_source__chatbot_id=id).count()
print(f"LEHEL {crawled_pages_count}")

for source in pdf_data_sources:
merged_files = []
@@ -122,7 +125,7 @@ def data_settings(request, id):
source.merged_files = merged_files
source.status_html = status_html

return render(request, 'settings-data.html', {'bot': bot, 'website_data_sources': website_data_sources, 'pdf_data_sources': pdf_data_sources, 'codebase_data_sources': codebase_data_sources})
return render(request, 'settings-data.html', {'bot': bot, 'website_data_sources': website_data_sources, 'pdf_data_sources': pdf_data_sources, 'codebase_data_sources': codebase_data_sources, 'crawled_pages_count': crawled_pages_count,})
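
As a design note, if a per-source page count were ever needed in the template instead of a single global total, the same information could come from one annotated query. A sketch, assuming the reverse relation is named crawled_pages as the prefetch_related('crawled_pages') call above suggests:

from django.db.models import Count

# Attach a crawled-page count to each website data source in a single query,
# rather than computing one global COUNT separately.
website_data_sources = (
    WebsiteDataSource.objects
    .filter(chatbot_id=id)
    .annotate(pages_count=Count('crawled_pages'))
    .order_by('-id')
)
# Each source would then expose source.pages_count to the template.
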


def analytics_settings(request, id):
130 changes: 127 additions & 3 deletions dj_backend_server/web/workers/crawler.py
@@ -21,6 +21,20 @@


def start_recursive_crawler(data_source_id, chatbot_id):
"""
This function starts a recursive crawler on a given data source. If the data source's crawling status is already
completed, the function will return immediately. Otherwise, it will set the status to "in progress", start the
crawling process, and handle the completion of the crawling. If any exception occurs during the process, the
crawling status will be set to "failed".
Args:
data_source_id (int): The ID of the data source to be crawled. This should be a primary key of a WebsiteDataSource object.
chatbot_id (int): The ID of the chatbot initiating the crawl. This is used when handling the completion of the crawling.
Raises:
Exception: If any error occurs during the crawling process, the function will catch the exception, set the
crawling status to "failed", and re-raise the exception.
"""
data_source = WebsiteDataSource.objects.get(pk=data_source_id)
root_url = data_source.root_url
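
A condensed sketch of the full flow this docstring describes, filling in the parts the diff elides; the IN_PROGRESS and COMPLETED enum members and the max_pages field are assumptions (only FAILED is visible below):

def start_recursive_crawler_sketch(data_source_id, chatbot_id):
    data_source = WebsiteDataSource.objects.get(pk=data_source_id)
    if data_source.crawling_status == WebsiteDataSourceStatusType.COMPLETED.value:
        return  # already crawled, nothing to do
    try:
        data_source.crawling_status = WebsiteDataSourceStatusType.IN_PROGRESS.value
        data_source.save()
        crawl(data_source_id, data_source.root_url, [], data_source.max_pages, chatbot_id)
        data_source.crawling_status = WebsiteDataSourceStatusType.COMPLETED.value
        data_source.save()
        handle_crawling_completed(chatbot_id, data_source_id)
    except Exception:
        data_source.crawling_status = WebsiteDataSourceStatusType.FAILED.value
        data_source.save()
        raise
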

@@ -44,11 +58,27 @@ def start_recursive_crawler(data_source_id, chatbot_id):
data_source.crawling_status = WebsiteDataSourceStatusType.FAILED.value
data_source.save()

# the file will be stored in the website_data_sources/<data_source_id>/ directory.
def store_crawled_page_content_to_database(url, response, chatbot_id, data_source_id, html):
"""
This function stores the content of a crawled web page to the database. It first normalizes the HTML content,
saves it to a local file in the /website_data_sources directory, extracts the title of the page, and then creates
a CrawledPages object and saves it to the database. If any error occurs during the creation of the
CrawledPages object, the function will print the error message.
Args:
url (str): The URL of the web page that has been crawled.
response (Response): The response object returned by the request to the URL.
chatbot_id (int): The ID of the chatbot that initiated the crawl.
data_source_id (int): The ID of the data source from which the web page was crawled.
html (str): The HTML content of the web page.
Raises:
Exception: If any error occurs during the creation of the CrawledPages object, the function will catch the
exception and print the error message.
"""
html = get_normalized_content(html)

# Save the HTML content to a local file in /tmp directory
# Save the HTML content to a local file in /website_data_sources directory
file_name = str(uuid4()) + ".txt"
folder_name = os.path.join("website_data_sources", str(data_source_id))
file_path = os.path.join(folder_name, file_name)
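
The remainder of this function, elided above, writes the file and creates the database row. A rough sketch continuing from file_path; the CrawledPages field names are assumptions beyond what this diff shows:

    os.makedirs(folder_name, exist_ok=True)        # website_data_sources/<data_source_id>/
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(html)                              # normalized text content

    title = get_crawled_page_title(response.text)  # see the @TODO on titles further down
    try:
        CrawledPages.objects.create(               # field names assumed
            website_data_source_id=data_source_id,
            url=url,
            title=title,
        )
    except Exception as e:
        print(f"Failed to save crawled page: {e}")
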
@@ -73,6 +103,19 @@ def store_crawled_page_content_to_database(url, response, chatbot_id, data_sourc


def calculate_crawling_progress(crawled_pages, max_pages):
"""
This function calculates the progress of the crawling process. It divides the number of crawled pages by the maximum
number of pages that should be crawled, multiplies the result by 100 to get a percentage, and then rounds the result
to two decimal places. If the calculated progress exceeds 100, it is capped at 100. If the maximum number of pages
is zero or less, the function returns 0 to avoid division by zero.
Args:
crawled_pages (int): The number of pages that have been crawled.
max_pages (int): The maximum number of pages that should be crawled.
Returns:
float: The progress of the crawling process, as a percentage rounded to two decimal places.
"""
if max_pages <= 0:
return 0 # Avoid division by zero
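
The elided arithmetic reduces to a single rounded ratio, matching the docstring (the cap to 100 is visible in the next hunk). For example:

progress = round((crawled_pages / max_pages) * 100, 2)
return min(progress, 100)

# calculate_crawling_progress(37, 50) -> 74.0
# calculate_crawling_progress(60, 50) -> 100   (capped)
# calculate_crawling_progress(10, 0)  -> 0     (division-by-zero guard above)
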

@@ -81,6 +124,19 @@ def calculate_crawling_progress(crawled_pages, max_pages):
return min(progress, 100)

def update_crawling_progress(chatbot_id, data_source_id, progress):
"""
This function updates the crawling progress of a specific data source. It retrieves the data source object from
the database using the provided ID, sets its crawling progress to the provided value, and then saves the changes
to the database. If no data source with the provided ID exists, the function does nothing.
Args:
chatbot_id (int): The ID of the chatbot that initiated the crawl. Currently not used in this function but could be useful for future enhancements.
data_source_id (int): The ID of the data source whose crawling progress should be updated.
progress (float): The new crawling progress, as a percentage.
Raises:
WebsiteDataSource.DoesNotExist: If no data source with the provided ID exists, the function will catch this exception and do nothing.
"""
try:
data_source = WebsiteDataSource.objects.get(pk=data_source_id)
data_source.crawling_progress = progress
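
A plausible call site for this helper, presumably inside the crawl loop further down (the actual caller is not shown in this diff, so treat this as illustrative):

progress = calculate_crawling_progress(len(crawled_urls), max_pages)
update_crawling_progress(chatbot_id, data_source_id, progress)
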
@@ -89,8 +145,19 @@ def update_crawling_progress(chatbot_id, data_source_id, progress):
pass


# This method needs to be revisited and has to be written in a more sophisticated manner.
def get_normalized_content(html):
"""
This function normalizes the HTML content of a web page. It removes unwanted elements (such as 'script' and 'style' tags,
and elements with 'skip', 'menu', or 'dropdown' classes), extracts the text content from the cleaned HTML, removes extra
whitespace between words, and trims leading and trailing whitespace. The function uses BeautifulSoup to parse and manipulate
the HTML, and a recursive function to clean unwanted elements.
Args:
html (str): The HTML content of the web page to be normalized.
Returns:
str: The normalized text content of the web page.
"""
# Define a list of tags and classes to exclude
exclude_elements = ['script', 'style']
exclude_classes = ['skip', 'menu', 'dropdown']
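
Assembled from the pieces in this and the following hunks, the pipeline the docstring describes is roughly the following; the separator and whitespace handling in the elided lines are assumptions:

soup = BeautifulSoup(html, 'html.parser')
clean_elements(soup)                   # nested helper shown in the next hunk
text = soup.get_text(separator=' ')    # keep only visible text
return ' '.join(text.split())          # collapse runs of whitespace and trim
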
@@ -99,6 +166,16 @@ def get_normalized_content(html):

# Function to recursively remove unwanted elements
def clean_elements(element):
"""
This function recursively removes unwanted elements from a BeautifulSoup element. It iterates over the direct children
of the provided element. If a child element's tag name is in the 'exclude_elements' list or any of its classes are in the
'exclude_classes' list, the child element is replaced with a whitespace. Otherwise, the function is called recursively on
the child element.
Args:
element (bs4.element.Tag): The BeautifulSoup element to clean. This should be a tag object that potentially contains
other tag objects (children).
"""
for child in element.find_all(recursive=False):
if child.name in exclude_elements or any(cls in child.get('class', []) for cls in exclude_classes):
# Replace excluded elements with whitespace
@@ -121,6 +198,17 @@ def clean_elements(element):


def get_crawled_page_title(html):
"""
This function extracts the title of a web page from its HTML content. It uses BeautifulSoup to parse the HTML and find
the 'title' tag. If a 'title' tag is found, the function returns its text content. If no 'title' tag is found, the function
    returns None. @TODO: check why the title is not being saved to the database.
Args:
html (str): The HTML content of the web page from which to extract the title.
Returns:
str or None: The title of the web page, or None if no 'title' tag is found.
"""
# Use BeautifulSoup to parse the HTML and extract the title
soup = BeautifulSoup(html, 'html.parser')
title_element = soup.find('title')
@@ -130,6 +218,18 @@ def get_crawled_page_title(html):


def extract_links(html, root_url):
"""
This function extracts all the URLs from the anchor tags in the HTML content of a web page. It uses BeautifulSoup to parse
the HTML and find all 'a' tags. The function then normalizes and filters the URLs to remove query parameters, fragments, and
URLs that do not belong to the same root URL host.
Args:
html (str): The HTML content of the web page from which to extract the URLs.
root_url (str): The root URL of the web page. This is used to filter out URLs that do not belong to the same host.
Returns:
list: A list of extracted, normalized, and filtered URLs.
"""
# Use BeautifulSoup to parse the HTML and extract all anchor tags
soup = BeautifulSoup(html, 'html.parser')
anchor_tags = soup.find_all('a')
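
The elided normalization and filtering, continuing from anchor_tags above, can be sketched as follows; whether trailing slashes or duplicate paths are kept is an assumption beyond what the docstring states:

from urllib.parse import urljoin, urlparse, urlunparse

root_host = urlparse(root_url).netloc
links = []
for tag in anchor_tags:
    href = tag.get('href')
    if not href:
        continue
    absolute = urljoin(root_url, href)      # resolve relative links against the root URL
    parsed = urlparse(absolute)
    if parsed.netloc != root_host:
        continue                            # keep same-host URLs only
    # Drop query parameters and fragments, as the docstring describes.
    normalized = urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', '', ''))
    if normalized not in links:
        links.append(normalized)
return links
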
@@ -157,6 +257,23 @@ def extract_links(html, root_url):


def crawl(data_source_id, url, crawled_urls, max_pages, chatbot_id):
"""
This function crawls a web page at a given URL and stores its content in the database. It also extracts all the links from
the page and recursively crawls each one. The function keeps track of the URLs that have been crawled to avoid duplicates and
stops crawling when the maximum number of pages has been reached. If an error occurs while crawling a URL, the function logs
the error and continues with the next URL.
Args:
data_source_id (int): The ID of the data source from which the web page is being crawled.
url (str): The URL of the web page to crawl.
crawled_urls (list): A list of URLs that have already been crawled. This is used to avoid crawling the same URL multiple times.
max_pages (int): The maximum number of pages to crawl.
chatbot_id (int): The ID of the chatbot that initiated the crawl.
Raises:
requests.exceptions.RequestException: If an error occurs while sending the HTTP GET request to the URL, the function will catch this exception and do nothing.
Exception: If any other error occurs during the crawling process, the function will catch the exception, log the error, and continue with the next URL.
"""
# Check if the maximum page limit has been reached
if len(crawled_urls) >= max_pages:
return
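
After the guards shown here and in the next hunk, the recursive body the docstring describes looks roughly like this; the timeout, logging style, and exact recursion conditions are assumptions:

crawled_urls.append(url)
try:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    store_crawled_page_content_to_database(url, response, chatbot_id, data_source_id, response.text)

    progress = calculate_crawling_progress(len(crawled_urls), max_pages)
    update_crawling_progress(chatbot_id, data_source_id, progress)

    for link in extract_links(response.text, url):
        if link not in crawled_urls and len(crawled_urls) < max_pages:
            crawl(data_source_id, link, crawled_urls, max_pages, chatbot_id)
except requests.exceptions.RequestException as e:
    print(f"Failed to crawl {url}: {e}")
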
@@ -168,6 +285,13 @@ def crawl(data_source_id, url, crawled_urls, max_pages, chatbot_id):
# Check if the URL ends with a binary file extension
binary_extensions = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.tif', '.tiff', '.webp']
if any(url.lower().endswith(ext) for ext in binary_extensions):
"""
This block of code checks if the URL ends with a binary file extension. It has a list of binary file extensions
(binary_extensions). The 'any' function is used with a generator expression that checks if the URL (converted to
lower case to ensure case-insensitive matching) ends with any of the extensions in the list. If the URL does end
with a binary file extension, the function immediately returns, effectively skipping the processing of binary files
in the web crawling process.
"""
return

# Add the current URL to the crawled URLs list
