Skip to content

Commit

Permalink
Revert "[front] feature: Use dust-app for parsing table headers (#7077)"
Browse files Browse the repository at this point in the history
This reverts commit 722579d.
  • Loading branch information
tdraier committed Sep 19, 2024
1 parent bc4a96e commit 98599e4
Show file tree
Hide file tree
Showing 17 changed files with 116 additions and 233 deletions.
44 changes: 33 additions & 11 deletions connectors/src/connectors/google_drive/temporal/spreadsheets.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import type { ModelId } from "@dust-tt/types";
import {
getGoogleSheetTableId,
getSanitizedHeaders,
InvalidStructuredDataHeaderError,
slugify,
} from "@dust-tt/types";
Expand Down Expand Up @@ -79,29 +80,38 @@ async function upsertTable(
},
truncate: true,
parents: [tableId, ...parents],
useAppForHeaderDetection: true,
});

logger.info(loggerArgs, "[Spreadsheet] Table upserted.");
}

/**
 * Restricts a sheet to its contiguous data range and drops blank rows.
 *
 * The column range is derived from the first row that contains at least one
 * non-blank cell: it starts at that row's first non-blank cell and ends just
 * before the next blank cell in that row (or at the end of the row). Every row
 * is then sliced to that range, and rows that are blank within the range are
 * removed.
 *
 * @param allRows - Raw sheet rows, each an array of cell strings.
 * @returns The rows restricted to the detected column range, or an empty array
 *   when no cell has any content.
 */
function findDataRangeAndSelectRows(allRows: string[][]): string[][] {
  const isBlank = (cell: string): boolean => cell.trim() === "";
  const hasData = (row: string[]): boolean =>
    row.some((cell) => !isBlank(cell));

  // Find the first row with data to determine the range.
  const firstNonEmptyRow = allRows.find(hasData);
  if (!firstNonEmptyRow) {
    return []; // No data found.
  }

  // Identify the range of data: start at the first non-empty cell and end at
  // the nearest following empty cell, or at the row end when there is none.
  const startIndex = firstNonEmptyRow.findIndex((cell) => !isBlank(cell));
  const nextBlankIndex = firstNonEmptyRow.findIndex(
    (cell, idx) => idx > startIndex && isBlank(cell)
  );
  const endIndex =
    nextBlankIndex === -1 ? firstNonEmptyRow.length : nextBlankIndex;

  // Select only rows and columns within the data range.
  return allRows
    .map((row) => row.slice(startIndex, endIndex))
    .filter(hasData);
}

function getValidRows(allRows: string[][], loggerArgs: object): string[][] {
const filteredRows = findDataRangeAndSelectRows(allRows);

const maxCols = filteredRows.reduce(
(acc, row) => (row.length > acc ? row.length : acc),
0
);

// We assume that the first row is always the headers.
// Headers are used to assert the number of cells per row.
const [rawHeaders] = filteredRows;
Expand All @@ -114,13 +124,25 @@ function getValidRows(allRows: string[][], loggerArgs: object): string[][] {
}

try {
const validRows: string[][] = filteredRows.map((row) => {
const headers = getSanitizedHeaders(rawHeaders);

const validRows: string[][] = filteredRows.map((row, index) => {
// The first row is the header row: return the sanitized headers.
if (index === 0) {
return headers;
}

// If a row has fewer cells than headers, we fill the gap with empty strings.
if (row.length < maxCols) {
const shortfall = maxCols - row.length;
if (row.length < headers.length) {
const shortfall = headers.length - row.length;
return [...row, ...Array(shortfall).fill("")];
}

// If a row has more cells than headers, we truncate the row.
if (row.length > headers.length) {
return row.slice(0, headers.length);
}

return row;
});

Expand Down
5 changes: 2 additions & 3 deletions connectors/src/connectors/microsoft/temporal/spreadsheets.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import type { Result } from "@dust-tt/types";
import { Err, Ok, slugify } from "@dust-tt/types";
import { Err, getSanitizedHeaders, Ok, slugify } from "@dust-tt/types";
import type { Client } from "@microsoft/microsoft-graph-client";
import { stringify } from "csv-stringify/sync";

Expand Down Expand Up @@ -99,7 +99,6 @@ async function upsertTable(
},
truncate: true,
parents,
useAppForHeaderDetection: true,
});

logger.info(loggerArgs, "[Spreadsheet] Table upserted.");
Expand Down Expand Up @@ -182,7 +181,7 @@ async function processSheet({

// Assuming the first line is the header row, at least one additional data line is required.
if (rawHeaders && rows.length > 1) {
const headers = rawHeaders;
const headers = getSanitizedHeaders(rawHeaders);

const parents = [
worksheetInternalId,
Expand Down
8 changes: 6 additions & 2 deletions connectors/src/connectors/notion/lib/notion_api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@ import type {
ParsedNotionPage,
PropertyKeys,
} from "@dust-tt/types";
import { assertNever, cacheWithRedis } from "@dust-tt/types";
import {
assertNever,
cacheWithRedis,
getSanitizedHeaders,
} from "@dust-tt/types";
import type { LogLevel } from "@notionhq/client";
import {
APIResponseError,
Expand Down Expand Up @@ -1005,7 +1009,7 @@ export async function renderDatabaseFromPages({
)
);

const sanitizedHeaders = header;
const sanitizedHeaders = getSanitizedHeaders(header);

let csv = await new Promise<string>((resolve, reject) => {
stringify(
Expand Down
3 changes: 0 additions & 3 deletions connectors/src/lib/data_sources.ts
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,6 @@ export async function upsertTableFromCsv({
loggerArgs,
truncate,
parents,
useAppForHeaderDetection,
}: {
dataSourceConfig: DataSourceConfig;
tableId: string;
Expand All @@ -573,7 +572,6 @@ export async function upsertTableFromCsv({
loggerArgs?: Record<string, string | number>;
truncate: boolean;
parents: string[];
useAppForHeaderDetection?: boolean;
}) {
const localLogger = logger.child({ ...loggerArgs, tableId, tableName });
const statsDTags = [
Expand Down Expand Up @@ -601,7 +599,6 @@ export async function upsertTableFromCsv({
tableId,
truncate,
async: true,
useAppForHeaderDetection,
};
const dustRequestConfig: AxiosRequestConfig = {
headers: {
Expand Down
32 changes: 3 additions & 29 deletions front/components/data_source/DocumentOrTableUploadOrEditModal.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import {
Modal,
Page,
PlusIcon,
SparklesIcon,
Spinner,
TrashIcon,
} from "@dust-tt/sparkle";
Expand All @@ -18,8 +17,8 @@ import type {
CoreAPILightDocument,
DataSourceViewType,
LightContentNode,
LightWorkspaceType,
PlanType,
WorkspaceType,
} from "@dust-tt/types";
import {
BIG_FILE_SIZE,
Expand All @@ -43,7 +42,7 @@ interface DocumentOrTableUploadOrEditModalProps {
dataSourceView: DataSourceViewType;
isOpen: boolean;
onClose: (save: boolean) => void;
owner: WorkspaceType;
owner: LightWorkspaceType;
plan: PlanType;
totalNodesCount: number;
viewType: ContentNodesViewType;
Expand Down Expand Up @@ -99,8 +98,7 @@ export function DocumentOrTableUploadOrEditModal({
const [uploading, setUploading] = useState(false);
const [isBigFile, setIsBigFile] = useState(false);
const [developerOptionsVisible, setDeveloperOptionsVisible] = useState(false);
const [useAppForHeaderDetection, setUseAppForHeaderDetection] =
useState(false);

const isTable = viewType == "tables";
const initialId = contentNode?.internalId;

Expand Down Expand Up @@ -197,7 +195,6 @@ export function DocumentOrTableUploadOrEditModal({
parents: [],
truncate: false,
async: false,
useAppForHeaderDetection,
});

const res = await fetch(endpoint, {
Expand Down Expand Up @@ -481,29 +478,6 @@ export function DocumentOrTableUploadOrEditModal({
)}
</div>

{isTable &&
owner.flags.includes("use_app_for_header_detection") && (
<div>
<Page.SectionHeader
title="Enable header detection"
description={
"Use the LLM model to detect headers in the CSV file."
}
action={{
label: useAppForHeaderDetection ? "Disable" : "Enable",
variant: useAppForHeaderDetection
? "primary"
: "tertiary",
icon: SparklesIcon,
onClick: () =>
setUseAppForHeaderDetection(
!useAppForHeaderDetection
),
}}
/>
</div>
)}

{!isTable && (
<div>
<Page.SectionHeader
Expand Down
3 changes: 1 addition & 2 deletions front/lib/api/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,7 @@ const config = {
url:
EnvironmentConfig.getOptionalEnvVariable("DUST_PROD_API") ??
PRODUCTION_DUST_API,
nodeEnv:
EnvironmentConfig.getOptionalEnvVariable("NODE_ENV") || "development",
nodeEnv: EnvironmentConfig.getEnvVariable("NODE_ENV"),
};
},
getOAuthAPIConfig: (): { url: string; apiKey: string | null } => {
Expand Down
14 changes: 1 addition & 13 deletions front/lib/api/data_sources.ts
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,6 @@ export async function upsertTable({
async,
dataSource,
auth,
useAppForHeaderDetection,
}: {
tableId?: string | null;
name: string;
Expand All @@ -380,7 +379,6 @@ export async function upsertTable({
async: boolean;
dataSource: DataSourceResource;
auth: Authenticator;
useAppForHeaderDetection?: boolean;
}) {
const nonNullTableId = tableId ?? generateLegacyModelSId();
const tableParents: string[] = parents ?? [];
Expand All @@ -389,17 +387,9 @@ export async function upsertTable({
tableParents.push(nonNullTableId);
}

const useAppForHeaderDetectionFlag = auth
.getNonNullableWorkspace()
.flags.includes("use_app_for_header_detection");

const useApp = !!useAppForHeaderDetection && useAppForHeaderDetectionFlag;

if (async) {
// Ensure the CSV is valid before enqueuing the upsert.
const csvRowsRes = csv
? await rowsFromCsv({ auth, csv, useAppForHeaderDetection: useApp })
: null;
const csvRowsRes = csv ? await rowsFromCsv(csv) : null;
if (csvRowsRes?.isErr()) {
return csvRowsRes;
}
Expand All @@ -416,7 +406,6 @@ export async function upsertTable({
tableParents,
csv: csv ?? null,
truncate,
useAppForHeaderDetection: useApp,
},
});
if (enqueueRes.isErr()) {
Expand All @@ -437,7 +426,6 @@ export async function upsertTable({
tableParents,
csv: csv ?? null,
truncate,
useAppForHeaderDetection: useApp,
});

return tableRes;
Expand Down
Loading

0 comments on commit 98599e4

Please sign in to comment.