Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[front] feature: Use dust-app for parsing table headers #7543

Draft
wants to merge 27 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
0a22f75
Use dust-app for parsing table headers
tdraier Sep 3, 2024
086faaf
update app id
tdraier Sep 3, 2024
54b63e6
small fixes
tdraier Sep 3, 2024
c99462d
Merge branch 'main' into table-headers
tdraier Sep 3, 2024
339c0da
Merge branch 'main' into table-headers
tdraier Sep 4, 2024
a310c13
Merge branch 'main' into table-headers
tdraier Sep 6, 2024
235675b
Merge branch 'main' into table-headers
tdraier Sep 8, 2024
a5f020e
Moved headers sanitize to upsert activity, add feature flag
tdraier Sep 9, 2024
71f0113
Merge branch 'main' into table-headers
tdraier Sep 9, 2024
8670e6d
add optional param in endpoints
tdraier Sep 11, 2024
a066312
Merge branch 'main' into table-headers
tdraier Sep 11, 2024
901bec1
clean import
tdraier Sep 11, 2024
9463b65
Merge branch 'main' into table-headers
tdraier Sep 11, 2024
4e6a97e
Handle google spreadsheets
tdraier Sep 11, 2024
de4dced
Merge branch 'main' into table-headers
tdraier Sep 15, 2024
c4c3408
fixes
tdraier Sep 15, 2024
90e85b8
removed unused error
tdraier Sep 16, 2024
6a56eb0
restore sanitize limit, allow undefined param
tdraier Sep 16, 2024
b37c2e6
Default to false
tdraier Sep 16, 2024
6632e4c
Merge branch 'main' into table-headers
tdraier Sep 17, 2024
5c5e834
review
tdraier Sep 17, 2024
287b3a3
Add logging
tdraier Sep 19, 2024
387a8b6
Merge branch 'main' into table-headers
tdraier Sep 19, 2024
9ab9637
Merge branch 'main' into table-headers
tdraier Sep 19, 2024
335975e
ensure api compat
tdraier Sep 19, 2024
d3bc2c0
Update app
tdraier Sep 19, 2024
dbb5c2b
Merge branch 'main' into table-headers
tdraier Sep 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 11 additions & 33 deletions connectors/src/connectors/google_drive/temporal/spreadsheets.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import type { ModelId } from "@dust-tt/types";
import {
getGoogleSheetTableId,
getSanitizedHeaders,
InvalidStructuredDataHeaderError,
slugify,
} from "@dust-tt/types";
Expand Down Expand Up @@ -80,38 +79,29 @@ async function upsertTable(
},
truncate: true,
parents: [tableId, ...parents],
useAppForHeaderDetection: true,
});

logger.info(loggerArgs, "[Spreadsheet] Table upserted.");
}

function findDataRangeAndSelectRows(allRows: string[][]): string[][] {
// Find the first row with data to determine the range.
const firstNonEmptyRow = allRows.find((row) =>
const nonEmptyRow = allRows.filter((row) =>
row.some((cell) => cell.trim() !== "")
);
if (!firstNonEmptyRow) {
return []; // No data found.
}

// Identify the range of data: Start at the first non-empty cell and end at the nearest following empty cell or row end.
const startIndex = firstNonEmptyRow.findIndex((cell) => cell.trim() !== "");
let endIndex = firstNonEmptyRow.findIndex(
(cell, idx) => idx > startIndex && cell.trim() === ""
);
if (endIndex === -1) {
endIndex = firstNonEmptyRow.length;
}

// Select only rows and columns within the data range.
return allRows
.map((row) => row.slice(startIndex, endIndex))
.filter((row) => row.some((cell) => cell.trim() !== ""));
return nonEmptyRow;
}

function getValidRows(allRows: string[][], loggerArgs: object): string[][] {
const filteredRows = findDataRangeAndSelectRows(allRows);

const maxCols = filteredRows.reduce(
(acc, row) => (row.length > acc ? row.length : acc),
0
);

// We assume that the first row is always the headers.
// Headers are used to assert the number of cells per row.
const [rawHeaders] = filteredRows;
Expand All @@ -124,25 +114,13 @@ function getValidRows(allRows: string[][], loggerArgs: object): string[][] {
}

try {
const headers = getSanitizedHeaders(rawHeaders);

const validRows: string[][] = filteredRows.map((row, index) => {
// Return raw headers.
if (index === 0) {
return headers;
}

const validRows: string[][] = filteredRows.map((row) => {
// If a row has fewer cells than headers, we fill the gap with empty strings.
if (row.length < headers.length) {
const shortfall = headers.length - row.length;
if (row.length < maxCols) {
const shortfall = maxCols - row.length;
return [...row, ...Array(shortfall).fill("")];
}

// If a row has more cells than headers, we truncate the row.
if (row.length > headers.length) {
return row.slice(0, headers.length);
}

return row;
});

Expand Down
5 changes: 3 additions & 2 deletions connectors/src/connectors/microsoft/temporal/spreadsheets.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import type { Result } from "@dust-tt/types";
import { Err, getSanitizedHeaders, Ok, slugify } from "@dust-tt/types";
import { Err, Ok, slugify } from "@dust-tt/types";
import type { Client } from "@microsoft/microsoft-graph-client";
import { stringify } from "csv-stringify/sync";

Expand Down Expand Up @@ -99,6 +99,7 @@ async function upsertTable(
},
truncate: true,
parents,
useAppForHeaderDetection: true,
});

logger.info(loggerArgs, "[Spreadsheet] Table upserted.");
Expand Down Expand Up @@ -181,7 +182,7 @@ async function processSheet({

// Assuming the first line as headers, at least one additional data line is required.
if (rawHeaders && rows.length > 1) {
const headers = getSanitizedHeaders(rawHeaders);
const headers = rawHeaders;

const parents = [
worksheetInternalId,
Expand Down
8 changes: 2 additions & 6 deletions connectors/src/connectors/notion/lib/notion_api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,7 @@ import type {
ParsedNotionPage,
PropertyKeys,
} from "@dust-tt/types";
import {
assertNever,
cacheWithRedis,
getSanitizedHeaders,
} from "@dust-tt/types";
import { assertNever, cacheWithRedis } from "@dust-tt/types";
import type { LogLevel } from "@notionhq/client";
import {
APIResponseError,
Expand Down Expand Up @@ -1009,7 +1005,7 @@ export async function renderDatabaseFromPages({
)
);

const sanitizedHeaders = getSanitizedHeaders(header);
const sanitizedHeaders = header;

let csv = await new Promise<string>((resolve, reject) => {
stringify(
Expand Down
3 changes: 3 additions & 0 deletions connectors/src/lib/data_sources.ts
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,7 @@ export async function upsertTableFromCsv({
loggerArgs,
truncate,
parents,
useAppForHeaderDetection,
}: {
dataSourceConfig: DataSourceConfig;
tableId: string;
Expand All @@ -572,6 +573,7 @@ export async function upsertTableFromCsv({
loggerArgs?: Record<string, string | number>;
truncate: boolean;
parents: string[];
useAppForHeaderDetection?: boolean;
}) {
const localLogger = logger.child({ ...loggerArgs, tableId, tableName });
const statsDTags = [
Expand Down Expand Up @@ -599,6 +601,7 @@ export async function upsertTableFromCsv({
tableId,
truncate,
async: true,
useAppForHeaderDetection,
};
const dustRequestConfig: AxiosRequestConfig = {
headers: {
Expand Down
32 changes: 29 additions & 3 deletions front/components/data_source/DocumentOrTableUploadOrEditModal.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import {
Modal,
Page,
PlusIcon,
SparklesIcon,
Spinner,
TrashIcon,
} from "@dust-tt/sparkle";
Expand All @@ -17,8 +18,8 @@ import type {
CoreAPILightDocument,
DataSourceViewType,
LightContentNode,
LightWorkspaceType,
PlanType,
WorkspaceType,
} from "@dust-tt/types";
import {
BIG_FILE_SIZE,
Expand All @@ -42,7 +43,7 @@ interface DocumentOrTableUploadOrEditModalProps {
dataSourceView: DataSourceViewType;
isOpen: boolean;
onClose: (save: boolean) => void;
owner: LightWorkspaceType;
owner: WorkspaceType;
plan: PlanType;
totalNodesCount: number;
viewType: ContentNodesViewType;
Expand Down Expand Up @@ -98,7 +99,8 @@ export function DocumentOrTableUploadOrEditModal({
const [uploading, setUploading] = useState(false);
const [isBigFile, setIsBigFile] = useState(false);
const [developerOptionsVisible, setDeveloperOptionsVisible] = useState(false);

const [useAppForHeaderDetection, setUseAppForHeaderDetection] =
useState(false);
const isTable = viewType == "tables";
const initialId = contentNode?.internalId;

Expand Down Expand Up @@ -195,6 +197,7 @@ export function DocumentOrTableUploadOrEditModal({
parents: [],
truncate: false,
async: false,
useAppForHeaderDetection,
});

const res = await fetch(endpoint, {
Expand Down Expand Up @@ -478,6 +481,29 @@ export function DocumentOrTableUploadOrEditModal({
)}
</div>

{isTable &&
owner.flags.includes("use_app_for_header_detection") && (
<div>
<Page.SectionHeader
title="Enable header detection"
description={
"Use the LLM model to detect headers in the CSV file."
}
action={{
label: useAppForHeaderDetection ? "Disable" : "Enable",
variant: useAppForHeaderDetection
? "primary"
: "tertiary",
icon: SparklesIcon,
onClick: () =>
setUseAppForHeaderDetection(
!useAppForHeaderDetection
),
}}
/>
</div>
)}

{!isTable && (
<div>
<Page.SectionHeader
Expand Down
3 changes: 2 additions & 1 deletion front/lib/api/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ const config = {
url:
EnvironmentConfig.getOptionalEnvVariable("DUST_PROD_API") ??
PRODUCTION_DUST_API,
nodeEnv: EnvironmentConfig.getEnvVariable("NODE_ENV"),
nodeEnv:
EnvironmentConfig.getOptionalEnvVariable("NODE_ENV") || "development",
};
},
getOAuthAPIConfig: (): { url: string; apiKey: string | null } => {
Expand Down
14 changes: 13 additions & 1 deletion front/lib/api/data_sources.ts
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@ export async function upsertTable({
async,
dataSource,
auth,
useAppForHeaderDetection,
}: {
tableId?: string | null;
name: string;
Expand All @@ -379,6 +380,7 @@ export async function upsertTable({
async: boolean;
dataSource: DataSourceResource;
auth: Authenticator;
useAppForHeaderDetection?: boolean;
}) {
const nonNullTableId = tableId ?? generateLegacyModelSId();
const tableParents: string[] = parents ?? [];
Expand All @@ -387,9 +389,17 @@ export async function upsertTable({
tableParents.push(nonNullTableId);
}

const useAppForHeaderDetectionFlag = auth
.getNonNullableWorkspace()
.flags.includes("use_app_for_header_detection");

const useApp = !!useAppForHeaderDetection && useAppForHeaderDetectionFlag;

if (async) {
// Ensure the CSV is valid before enqueuing the upsert.
const csvRowsRes = csv ? await rowsFromCsv(csv) : null;
const csvRowsRes = csv
? await rowsFromCsv({ auth, csv, useAppForHeaderDetection: useApp })
: null;
if (csvRowsRes?.isErr()) {
return csvRowsRes;
}
Expand All @@ -406,6 +416,7 @@ export async function upsertTable({
tableParents,
csv: csv ?? null,
truncate,
useAppForHeaderDetection: useApp,
},
});
if (enqueueRes.isErr()) {
Expand All @@ -426,6 +437,7 @@ export async function upsertTable({
tableParents,
csv: csv ?? null,
truncate,
useAppForHeaderDetection: useApp,
});

return tableRes;
Expand Down
Loading
Loading