feat: new Columnar upload form and API (#28192)

This commit is contained in:
Daniel Vaz Gaspar 2024-05-06 15:51:42 +01:00 committed by GitHub
parent f5843fe588
commit 9a339f08a7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
29 changed files with 2267 additions and 1232 deletions

View File

@ -138,7 +138,6 @@
"use-event-callback": "^0.1.0",
"use-immer": "^0.9.0",
"use-query-params": "^1.1.9",
"xlsx": "^0.18.5",
"yargs": "^17.7.2"
},
"devDependencies": {
@ -25339,14 +25338,6 @@
"node": ">= 0.12.0"
}
},
"node_modules/adler-32": {
"version": "1.3.1",
"resolved": "https://registry.npmjs.org/adler-32/-/adler-32-1.3.1.tgz",
"integrity": "sha512-ynZ4w/nUUv5rrsR8UUGoe1VC9hZj6V5hU9Qw1HlMDJGEJw5S7TfTErWTjMys6M7vr0YWcPqs3qAr4ss0nDfP+A==",
"engines": {
"node": ">=0.8"
}
},
"node_modules/agent-base": {
"version": "6.0.2",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-6.0.2.tgz",
@ -28096,18 +28087,6 @@
"url": "https://github.com/sponsors/wooorm"
}
},
"node_modules/cfb": {
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/cfb/-/cfb-1.2.2.tgz",
"integrity": "sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA==",
"dependencies": {
"adler-32": "~1.3.0",
"crc-32": "~1.2.0"
},
"engines": {
"node": ">=0.8"
}
},
"node_modules/chainsaw": {
"version": "0.1.0",
"resolved": "https://registry.npmjs.org/chainsaw/-/chainsaw-0.1.0.tgz",
@ -28906,14 +28885,6 @@
"node": ">=0.10.0"
}
},
"node_modules/codepage": {
"version": "1.15.0",
"resolved": "https://registry.npmjs.org/codepage/-/codepage-1.15.0.tgz",
"integrity": "sha512-3g6NUTPd/YtuuGrhMnOMRjFc+LJw/bnMp3+0r/Wcz3IXUuCosKRJvMphm5+Q+bvTVGcJJuRvVLuYba+WojaFaA==",
"engines": {
"node": ">=0.8"
}
},
"node_modules/collect-v8-coverage": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/collect-v8-coverage/-/collect-v8-coverage-1.0.1.tgz",
@ -29885,17 +29856,6 @@
"node": ">=8"
}
},
"node_modules/crc-32": {
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/crc-32/-/crc-32-1.2.2.tgz",
"integrity": "sha512-ROmzCKrTnOwybPcJApAA6WBWij23HVfGVNKqqrZpuyZOHqK2CwHSvpGuyt/UNNvaIjEd8X5IFGp4Mh+Ie1IHJQ==",
"bin": {
"crc32": "bin/crc32.njs"
},
"engines": {
"node": ">=0.8"
}
},
"node_modules/create-emotion": {
"version": "10.0.27",
"resolved": "https://registry.npmjs.org/create-emotion/-/create-emotion-10.0.27.tgz",
@ -37175,14 +37135,6 @@
"node": ">=12.20.0"
}
},
"node_modules/frac": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/frac/-/frac-1.1.2.tgz",
"integrity": "sha512-w/XBfkibaTl3YDqASwfDUqkna4Z2p9cFSr1aHDt0WoMTECnRfBOv2WArlZILlqgWlmdIlALXGpM2AOhEk5W3IA==",
"engines": {
"node": ">=0.8"
}
},
"node_modules/fragment-cache": {
"version": "0.2.1",
"resolved": "https://registry.npmjs.org/fragment-cache/-/fragment-cache-0.2.1.tgz",
@ -62001,17 +61953,6 @@
"resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz",
"integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw="
},
"node_modules/ssf": {
"version": "0.11.2",
"resolved": "https://registry.npmjs.org/ssf/-/ssf-0.11.2.tgz",
"integrity": "sha512-+idbmIXoYET47hH+d7dfm2epdOMUDjqcB4648sTZ+t2JwoyBFL/insLfB/racrDmsKB3diwsDA696pZMieAC5g==",
"dependencies": {
"frac": "~1.1.2"
},
"engines": {
"node": ">=0.8"
}
},
"node_modules/sshpk": {
"version": "1.15.2",
"resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.15.2.tgz",
@ -66195,22 +66136,6 @@
"integrity": "sha512-JcKqAHLPxcdb9KM49dufGXn2x3ssnfjbcaQdLlfZsL9rH9wgDQjUtDxbo8NE0F6SFvydeu1VhZe7hZuHsB2/pw==",
"dev": true
},
"node_modules/wmf": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/wmf/-/wmf-1.0.2.tgz",
"integrity": "sha512-/p9K7bEh0Dj6WbXg4JG0xvLQmIadrner1bi45VMJTfnbVHsc7yIajZyoSoK60/dtVBs12Fm6WkUI5/3WAVsNMw==",
"engines": {
"node": ">=0.8"
}
},
"node_modules/word": {
"version": "0.3.0",
"resolved": "https://registry.npmjs.org/word/-/word-0.3.0.tgz",
"integrity": "sha512-OELeY0Q61OXpdUfTp+oweA/vtLVg5VDOXh+3he3PNzLGG/y0oylSOC1xRVj0+l4vQ3tj/bB1HVHv1ocXkQceFA==",
"engines": {
"node": ">=0.8"
}
},
"node_modules/wordwrap": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-1.0.0.tgz",
@ -66525,26 +66450,6 @@
"url": "https://opencollective.com/node-fetch"
}
},
"node_modules/xlsx": {
"version": "0.18.5",
"resolved": "https://registry.npmjs.org/xlsx/-/xlsx-0.18.5.tgz",
"integrity": "sha512-dmg3LCjBPHZnQp5/F/+nnTa+miPJxUXB6vtk42YjBBKayDNagxGEeIdWApkYPOf3Z3pm3k62Knjzp7lMeTEtFQ==",
"dependencies": {
"adler-32": "~1.3.0",
"cfb": "~1.2.1",
"codepage": "~1.15.0",
"crc-32": "~1.2.1",
"ssf": "~0.11.2",
"wmf": "~1.0.1",
"word": "~0.3.0"
},
"bin": {
"xlsx": "bin/xlsx.njs"
},
"engines": {
"node": ">=0.8"
}
},
"node_modules/xml-name-validator": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-3.0.0.tgz",
@ -91924,11 +91829,6 @@
"integrity": "sha512-aT6camzM4xEA54YVJYSqxz1kv4IHnQZRtThJJHhUMRExaU5spC7jX5ugSwTaTgJliIgs4VhZOk7htClvQ/LmRA==",
"dev": true
},
"adler-32": {
"version": "1.3.1",
"resolved": "https://registry.npmjs.org/adler-32/-/adler-32-1.3.1.tgz",
"integrity": "sha512-ynZ4w/nUUv5rrsR8UUGoe1VC9hZj6V5hU9Qw1HlMDJGEJw5S7TfTErWTjMys6M7vr0YWcPqs3qAr4ss0nDfP+A=="
},
"agent-base": {
"version": "6.0.2",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-6.0.2.tgz",
@ -94034,15 +93934,6 @@
"resolved": "https://registry.npmjs.org/ccount/-/ccount-2.0.1.tgz",
"integrity": "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg=="
},
"cfb": {
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/cfb/-/cfb-1.2.2.tgz",
"integrity": "sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA==",
"requires": {
"adler-32": "~1.3.0",
"crc-32": "~1.2.0"
}
},
"chainsaw": {
"version": "0.1.0",
"resolved": "https://registry.npmjs.org/chainsaw/-/chainsaw-0.1.0.tgz",
@ -94654,11 +94545,6 @@
"resolved": "https://registry.npmjs.org/code-point-at/-/code-point-at-1.1.0.tgz",
"integrity": "sha1-DQcLTQQ6W+ozovGkDi7bPZpMz3c="
},
"codepage": {
"version": "1.15.0",
"resolved": "https://registry.npmjs.org/codepage/-/codepage-1.15.0.tgz",
"integrity": "sha512-3g6NUTPd/YtuuGrhMnOMRjFc+LJw/bnMp3+0r/Wcz3IXUuCosKRJvMphm5+Q+bvTVGcJJuRvVLuYba+WojaFaA=="
},
"collect-v8-coverage": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/collect-v8-coverage/-/collect-v8-coverage-1.0.1.tgz",
@ -95426,11 +95312,6 @@
}
}
},
"crc-32": {
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/crc-32/-/crc-32-1.2.2.tgz",
"integrity": "sha512-ROmzCKrTnOwybPcJApAA6WBWij23HVfGVNKqqrZpuyZOHqK2CwHSvpGuyt/UNNvaIjEd8X5IFGp4Mh+Ie1IHJQ=="
},
"create-emotion": {
"version": "10.0.27",
"resolved": "https://registry.npmjs.org/create-emotion/-/create-emotion-10.0.27.tgz",
@ -101104,11 +100985,6 @@
"fetch-blob": "^3.1.2"
}
},
"frac": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/frac/-/frac-1.1.2.tgz",
"integrity": "sha512-w/XBfkibaTl3YDqASwfDUqkna4Z2p9cFSr1aHDt0WoMTECnRfBOv2WArlZILlqgWlmdIlALXGpM2AOhEk5W3IA=="
},
"fragment-cache": {
"version": "0.2.1",
"resolved": "https://registry.npmjs.org/fragment-cache/-/fragment-cache-0.2.1.tgz",
@ -119976,14 +119852,6 @@
"resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz",
"integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw="
},
"ssf": {
"version": "0.11.2",
"resolved": "https://registry.npmjs.org/ssf/-/ssf-0.11.2.tgz",
"integrity": "sha512-+idbmIXoYET47hH+d7dfm2epdOMUDjqcB4648sTZ+t2JwoyBFL/insLfB/racrDmsKB3diwsDA696pZMieAC5g==",
"requires": {
"frac": "~1.1.2"
}
},
"sshpk": {
"version": "1.15.2",
"resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.15.2.tgz",
@ -123149,16 +123017,6 @@
"integrity": "sha512-JcKqAHLPxcdb9KM49dufGXn2x3ssnfjbcaQdLlfZsL9rH9wgDQjUtDxbo8NE0F6SFvydeu1VhZe7hZuHsB2/pw==",
"dev": true
},
"wmf": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/wmf/-/wmf-1.0.2.tgz",
"integrity": "sha512-/p9K7bEh0Dj6WbXg4JG0xvLQmIadrner1bi45VMJTfnbVHsc7yIajZyoSoK60/dtVBs12Fm6WkUI5/3WAVsNMw=="
},
"word": {
"version": "0.3.0",
"resolved": "https://registry.npmjs.org/word/-/word-0.3.0.tgz",
"integrity": "sha512-OELeY0Q61OXpdUfTp+oweA/vtLVg5VDOXh+3he3PNzLGG/y0oylSOC1xRVj0+l4vQ3tj/bB1HVHv1ocXkQceFA=="
},
"wordwrap": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-1.0.0.tgz",
@ -123398,20 +123256,6 @@
}
}
},
"xlsx": {
"version": "0.18.5",
"resolved": "https://registry.npmjs.org/xlsx/-/xlsx-0.18.5.tgz",
"integrity": "sha512-dmg3LCjBPHZnQp5/F/+nnTa+miPJxUXB6vtk42YjBBKayDNagxGEeIdWApkYPOf3Z3pm3k62Knjzp7lMeTEtFQ==",
"requires": {
"adler-32": "~1.3.0",
"cfb": "~1.2.1",
"codepage": "~1.15.0",
"crc-32": "~1.2.1",
"ssf": "~0.11.2",
"wmf": "~1.0.1",
"word": "~0.3.0"
}
},
"xml-name-validator": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-3.0.0.tgz",

View File

@ -204,7 +204,6 @@
"use-event-callback": "^0.1.0",
"use-immer": "^0.9.0",
"use-query-params": "^1.1.9",
"xlsx": "^0.18.5",
"yargs": "^17.7.2"
},
"devDependencies": {

View File

@ -29,6 +29,7 @@ import { forEach } from 'lodash';
fetchMock.post('glob:*api/v1/database/1/csv_upload/', {});
fetchMock.post('glob:*api/v1/database/1/excel_upload/', {});
fetchMock.post('glob:*api/v1/database/1/columnar_upload/', {});
fetchMock.get(
'glob:*api/v1/database/?q=(filters:!((col:allow_file_upload,opr:eq,value:!t)),page:0,page_size:100)',
@ -68,6 +69,13 @@ const excelProps = {
type: 'excel',
};
const columnarProps = {
show: true,
onHide: () => {},
allowedExtensions: ['parquet', 'zip'],
type: 'columnar',
};
test('CSV, renders the general information elements correctly', () => {
render(<UploadDataModal {...csvProps} />, {
useRedux: true,
@ -200,6 +208,78 @@ test('Excel, renders the general information elements correctly', () => {
});
});
test('Columnar, renders the general information elements correctly', () => {
render(<UploadDataModal {...columnarProps} />, {
useRedux: true,
});
const cancelButton = screen.getByRole('button', {
name: 'Cancel',
});
const uploadButton = screen.getByRole('button', {
name: 'Upload',
});
const selectButton = screen.getByRole('button', {
name: 'Select',
});
const title = screen.getByRole('heading', {
name: /columnar upload/i,
});
const missingTitle = screen.queryByRole('heading', {
name: /csv upload/i,
});
expect(missingTitle).not.toBeInTheDocument();
const panel1 = screen.getByRole('heading', {
name: /General information/i,
});
const panel2 = screen.getByRole('heading', {
name: /file settings/i,
});
const panel3 = screen.getByRole('heading', {
name: /columns/i,
});
const panel4 = screen.queryByRole('heading', {
name: /rows/i,
});
expect(panel4).not.toBeInTheDocument();
const selectDatabase = screen.getByRole('combobox', {
name: /select a database/i,
});
const selectDelimiter = screen.queryByRole('combobox', {
name: /choose a delimiter/i,
});
expect(selectDelimiter).not.toBeInTheDocument();
const selectSheetName = screen.queryByRole('combobox', {
name: /choose sheet name/i,
});
expect(selectSheetName).not.toBeInTheDocument();
const inputTableName = screen.getByRole('textbox', {
name: /table name/i,
});
const inputSchema = screen.getByRole('combobox', {
name: /schema/i,
});
const visibleComponents = [
cancelButton,
uploadButton,
selectButton,
title,
panel1,
panel2,
panel3,
selectDatabase,
inputTableName,
inputSchema,
];
visibleComponents.forEach(component => {
expect(component).toBeVisible();
});
});
test('CSV, renders the file settings elements correctly', () => {
render(<UploadDataModal {...csvProps} />, {
useRedux: true,
@ -282,6 +362,45 @@ test('Excel, renders the file settings elements correctly', () => {
});
});
test('Columnar, renders the file settings elements correctly', () => {
render(<UploadDataModal {...columnarProps} />, {
useRedux: true,
});
expect(screen.queryByText('If Table Already Exists')).not.toBeInTheDocument();
const panelHeader = screen.getByRole('heading', {
name: /file settings/i,
});
userEvent.click(panelHeader);
const selectTableAlreadyExists = screen.getByRole('combobox', {
name: /choose already exists/i,
});
const inputDecimalCharacter = screen.queryByRole('textbox', {
name: /decimal character/i,
});
expect(inputDecimalCharacter).not.toBeInTheDocument();
const selectColumnsDates = screen.queryByRole('combobox', {
name: /choose columns to be parsed as dates/i,
});
expect(selectColumnsDates).not.toBeInTheDocument();
const selectNullValues = screen.queryByRole('combobox', {
name: /null values/i,
});
expect(selectNullValues).not.toBeInTheDocument();
const switchSkipInitialSpace = screen.queryByText('skipInitialSpace');
expect(switchSkipInitialSpace).not.toBeInTheDocument();
const switchSkipBlankLines = screen.queryByText('skipBlankLines');
expect(switchSkipBlankLines).not.toBeInTheDocument();
const switchDayFirst = screen.queryByText('dayFirst');
expect(switchDayFirst).not.toBeInTheDocument();
const visibleComponents = [selectTableAlreadyExists];
visibleComponents.forEach(component => {
expect(component).toBeVisible();
});
});
test('CSV, renders the columns elements correctly', () => {
render(<UploadDataModal {...csvProps} />, {
useRedux: true,
@ -291,12 +410,13 @@ test('CSV, renders the columns elements correctly', () => {
name: /columns/i,
});
userEvent.click(panelHeader);
const switchDataFrameIndex = screen.getByTestId('dataFrameIndex');
userEvent.click(switchDataFrameIndex);
const selectIndexColumn = screen.getByRole('combobox', {
name: /Choose index column/i,
});
const switchDataFrameIndex = screen.getByTestId('dataFrameIndex');
const inputColumnLabels = screen.getByRole('textbox', {
name: /Column labels/i,
name: /Index label/i,
});
const selectColumnsToRead = screen.getByRole('combobox', {
name: /Choose columns to read/i,
@ -327,12 +447,13 @@ test('Excel, renders the columns elements correctly', () => {
name: /columns/i,
});
userEvent.click(panelHeader);
const switchDataFrameIndex = screen.getByTestId('dataFrameIndex');
userEvent.click(switchDataFrameIndex);
const selectIndexColumn = screen.getByRole('combobox', {
name: /Choose index column/i,
});
const switchDataFrameIndex = screen.getByTestId('dataFrameIndex');
const inputColumnLabels = screen.getByRole('textbox', {
name: /Column labels/i,
const inputIndexLabel = screen.getByRole('textbox', {
name: /Index label/i,
});
const selectColumnsToRead = screen.getByRole('combobox', {
name: /Choose columns to read/i,
@ -348,7 +469,45 @@ test('Excel, renders the columns elements correctly', () => {
const visibleComponents = [
selectIndexColumn,
switchDataFrameIndex,
inputColumnLabels,
inputIndexLabel,
selectColumnsToRead,
];
visibleComponents.forEach(component => {
expect(component).toBeVisible();
});
});
test('Columnar, renders the columns elements correctly', () => {
render(<UploadDataModal {...columnarProps} />, {
useRedux: true,
});
const panelHeader = screen.getByRole('heading', {
name: /columns/i,
});
userEvent.click(panelHeader);
const selectIndexColumn = screen.queryByRole('combobox', {
name: /Choose index column/i,
});
expect(selectIndexColumn).not.toBeInTheDocument();
const switchDataFrameIndex = screen.getByTestId('dataFrameIndex');
userEvent.click(switchDataFrameIndex);
const inputIndexLabel = screen.getByRole('textbox', {
name: /Index label/i,
});
const selectColumnsToRead = screen.getByRole('combobox', {
name: /Choose columns to read/i,
});
userEvent.click(selectColumnsToRead);
const columnDataTypes = screen.queryByRole('textbox', {
name: /Column data types/i,
});
expect(columnDataTypes).not.toBeInTheDocument();
const visibleComponents = [
switchDataFrameIndex,
inputIndexLabel,
selectColumnsToRead,
];
visibleComponents.forEach(component => {
@ -381,6 +540,17 @@ test('renders the rows elements correctly', () => {
});
});
test('Columnar, does not render the rows', () => {
render(<UploadDataModal {...columnarProps} />, {
useRedux: true,
});
const panelHeader = screen.queryByRole('heading', {
name: /rows/i,
});
expect(panelHeader).not.toBeInTheDocument();
});
test('database and schema are correctly populated', async () => {
render(<UploadDataModal {...csvProps} />, {
useRedux: true,
@ -546,6 +716,67 @@ test('Excel, form post', async () => {
expect(fileData.name).toBe('test.xls');
});
test('Columnar, form post', async () => {
render(<UploadDataModal {...columnarProps} />, {
useRedux: true,
});
const selectButton = screen.getByRole('button', {
name: 'Select',
});
userEvent.click(selectButton);
// Select a file from the file dialog
const file = new File(['test'], 'test.parquet', { type: 'text' });
const inputElement = document.querySelector('input[type="file"]');
if (inputElement) {
userEvent.upload(inputElement, file);
}
const selectDatabase = screen.getByRole('combobox', {
name: /select a database/i,
});
userEvent.click(selectDatabase);
await waitFor(() => screen.getByText('database1'));
await waitFor(() => screen.getByText('database2'));
screen.getByText('database1').click();
const selectSchema = screen.getByRole('combobox', {
name: /schema/i,
});
userEvent.click(selectSchema);
await waitFor(() => screen.getAllByText('public'));
screen.getAllByText('public')[1].click();
// Fill out form fields
const inputTableName = screen.getByRole('textbox', {
name: /table name/i,
});
userEvent.type(inputTableName, 'table1');
const uploadButton = screen.getByRole('button', {
name: 'Upload',
});
userEvent.click(uploadButton);
await waitFor(() =>
fetchMock.called('glob:*api/v1/database/1/columnar_upload/'),
);
// Get the matching fetch calls made
const matchingCalls = fetchMock.calls(
'glob:*api/v1/database/1/columnar_upload/',
);
expect(matchingCalls).toHaveLength(1);
const [_, options] = matchingCalls[0];
const formData = options?.body as FormData;
expect(formData.get('table_name')).toBe('table1');
expect(formData.get('schema')).toBe('public');
expect(formData.get('table_name')).toBe('table1');
const fileData = formData.get('file') as File;
expect(fileData.name).toBe('test.parquet');
});
test('CSV, validate file extension returns false', () => {
const invalidFileNames = ['out', 'out.exe', 'out.csv.exe', '.csv', 'out.xls'];
forEach(invalidFileNames, fileName => {
@ -572,6 +803,25 @@ test('Excel, validate file extension returns false', () => {
});
});
test('Columnar, validate file extension returns false', () => {
const invalidFileNames = [
'out',
'out.exe',
'out.parquet.exe',
'.parquet',
'out.excel',
];
forEach(invalidFileNames, fileName => {
const file: UploadFile<any> = {
name: fileName,
uid: 'xp',
size: 100,
type: 'text/csv',
};
expect(validateUploadFileExtension(file, ['parquet', 'zip'])).toBe(false);
});
});
test('CSV, validate file extension returns true', () => {
const invalidFileNames = ['out.csv', 'out.tsv', 'out.exe.csv', 'out a.csv'];
forEach(invalidFileNames, fileName => {
@ -597,3 +847,21 @@ test('Excel, validate file extension returns true', () => {
expect(validateUploadFileExtension(file, ['xls', 'xlsx'])).toBe(true);
});
});
test('Columnar, validate file extension returns true', () => {
const invalidFileNames = [
'out.parquet',
'out.zip',
'out.exe.zip',
'out a.parquet',
];
forEach(invalidFileNames, fileName => {
const file: UploadFile<any> = {
name: fileName,
uid: 'xp',
size: 100,
type: 'text/csv',
};
expect(validateUploadFileExtension(file, ['parquet', 'zip'])).toBe(true);
});
});

View File

@ -40,7 +40,6 @@ import { Input, InputNumber } from 'src/components/Input';
import rison from 'rison';
import { UploadChangeParam, UploadFile } from 'antd/lib/upload/interface';
import withToasts from 'src/components/MessageToasts/withToasts';
import * as XLSX from 'xlsx';
import {
antdCollapseStyles,
antDModalNoPaddingStyles,
@ -69,9 +68,25 @@ const CSVSpecificFields = [
'skip_blank_lines',
'day_first',
'column_data_types',
'column_dates',
'decimal_character',
'null_values',
'index_column',
'header_row',
'rows_to_read',
'skip_rows',
];
const ExcelSpecificFields = ['sheet_name'];
const ExcelSpecificFields = [
'sheet_name',
'column_dates',
'decimal_character',
'null_values',
'index_column',
'header_row',
'rows_to_read',
'skip_rows',
];
const ColumnarSpecificFields: string[] = [];
@ -89,6 +104,9 @@ const UploadTypeToSpecificFields: Record<UploadType, string[]> = {
columnar: ColumnarSpecificFields,
};
const isFieldATypeSpecificField = (field: string, type: UploadType) =>
UploadTypeToSpecificFields[type].includes(field);
interface UploadInfo {
table_name: string;
schema: string;
@ -106,11 +124,16 @@ interface UploadInfo {
column_dates: Array<string>;
index_column: string | null;
dataframe_index: boolean;
column_labels: string;
index_label: string;
columns_read: Array<string>;
column_data_types: string;
}
interface SheetColumnNames {
sheet_name: string;
column_names: string[];
}
const defaultUploadInfo: UploadInfo = {
table_name: '',
schema: '',
@ -128,7 +151,7 @@ const defaultUploadInfo: UploadInfo = {
column_dates: [],
index_column: null,
dataframe_index: false,
column_labels: '',
index_label: '',
columns_read: [],
column_data_types: '',
};
@ -136,7 +159,11 @@ const defaultUploadInfo: UploadInfo = {
// Allowed extensions to accept for file upload, users can always override this
// by selecting all file extensions on the OS file picker. Also ".txt" will
// allow all files to be selected.
const READ_HEADER_SIZE = 10000;
const allowedExtensionsToAccept = {
csv: '.csv, .tsv',
excel: '.xls, .xlsx',
columnar: '.parquet, .zip',
};
export const validateUploadFileExtension = (
file: UploadFile<any>,
@ -183,21 +210,17 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
const [fileList, setFileList] = useState<UploadFile[]>([]);
const [columns, setColumns] = React.useState<string[]>([]);
const [sheetNames, setSheetNames] = React.useState<string[]>([]);
const [currentSheetName, setCurrentSheetName] = React.useState<
string | undefined
>();
const [sheetsColumnNames, setSheetsColumnNames] = React.useState<
SheetColumnNames[]
>([]);
const [delimiter, setDelimiter] = useState<string>(',');
const [isLoading, setIsLoading] = useState<boolean>(false);
const [currentSchema, setCurrentSchema] = useState<string | undefined>();
const [currentDataframeIndex, setCurrentDataframeIndex] =
useState<boolean>(false);
const [previewUploadedFile, setPreviewUploadedFile] = useState<boolean>(true);
const [fileLoading, setFileLoading] = useState<boolean>(false);
const allowedExtensionsToAccept = {
csv: '.csv, .tsv',
excel: '.xls, .xlsx',
columnar: '.parquet, .orc',
};
const createTypeToEndpointMap = (
databaseId: number,
): { [key: string]: string } => ({
@ -206,6 +229,12 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
columnar: `/api/v1/database/${databaseId}/columnar_upload/`,
});
const typeToFileMetadataEndpoint = {
csv: '/api/v1/database/csv_metadata/',
excel: '/api/v1/database/excel_metadata/',
columnar: '/api/v1/database/columnar_metadata/',
};
const nullValuesOptions = [
{
value: '""',
@ -286,12 +315,12 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
setColumns([]);
setCurrentSchema('');
setCurrentDatabaseId(0);
setCurrentSheetName(undefined);
setSheetNames([]);
setIsLoading(false);
setDelimiter(',');
setPreviewUploadedFile(true);
setFileLoading(false);
setSheetsColumnNames([]);
form.resetFields();
};
@ -343,6 +372,58 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
[currentDatabaseId],
);
const loadFileMetadata = (file: File) => {
const fields = form.getFieldsValue();
const mergedValues = { ...defaultUploadInfo, ...fields };
const formData = new FormData();
formData.append('file', file);
if (type === 'csv') {
formData.append('delimiter', mergedValues.delimiter);
}
setFileLoading(true);
return SupersetClient.post({
endpoint: typeToFileMetadataEndpoint[type],
body: formData,
headers: { Accept: 'application/json' },
})
.then(response => {
const { items } = response.json.result;
if (items && type !== 'excel') {
setColumns(items[0].column_names);
} else {
const { allSheetNames, sheetColumnNamesMap } = items.reduce(
(
acc: {
allSheetNames: any[];
sheetColumnNamesMap: Record<string, string[]>;
},
item: { sheet_name: any; column_names: any },
) => {
acc.allSheetNames.push(item.sheet_name);
acc.sheetColumnNamesMap[item.sheet_name] = item.column_names;
return acc;
},
{ allSheetNames: [], sheetColumnNamesMap: {} },
);
setColumns(items[0].column_names);
setSheetNames(allSheetNames);
form.setFieldsValue({ sheet_name: allSheetNames[0] });
setSheetsColumnNames(sheetColumnNamesMap);
}
})
.catch(response =>
getClientErrorObject(response).then(error => {
addDangerToast(error.error || 'Error');
setColumns([]);
form.setFieldsValue({ sheet_name: undefined });
setSheetNames([]);
}),
)
.finally(() => {
setFileLoading(false);
});
};
const getAllFieldsNotInType = (): string[] => {
const specificFields = UploadTypeToSpecificFields[type] || [];
return [...AllSpecificFields].filter(
@ -353,7 +434,13 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
const appendFormData = (formData: FormData, data: Record<string, any>) => {
const allFieldsNotInType = getAllFieldsNotInType();
Object.entries(data).forEach(([key, value]) => {
if (!(allFieldsNotInType.includes(key) || NonNullFields.includes(key))) {
if (
!(
allFieldsNotInType.includes(key) ||
(NonNullFields.includes(key) &&
(value === undefined || value === null))
)
) {
formData.append(key, value);
}
});
@ -401,13 +488,12 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
setFileList(fileList.filter(file => file.uid !== removedFile.uid));
setColumns([]);
setSheetNames([]);
setCurrentSheetName(undefined);
form.setFieldsValue({ sheet_name: undefined });
return false;
};
const onSheetNameChange = (value: string) => {
setCurrentSheetName(value);
setColumns(sheetsColumnNames[value] ?? []);
};
const columnsToOptions = () =>
@ -422,97 +508,6 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
label: sheetName,
}));
const readFileContent = (file: File) =>
new Promise<string>((resolve, reject) => {
const reader = new FileReader();
reader.onload = event => {
if (event.target) {
const text = event.target.result as string;
resolve(text);
} else {
reject(new Error('Failed to read file content'));
}
};
reader.onerror = () => {
reject(new Error('Failed to read file content'));
};
reader.readAsText(file.slice(0, READ_HEADER_SIZE));
});
const processCSVFile = async (file: File) => {
try {
setFileLoading(true);
const text = await readFileContent(file);
const firstLine = text.split('\n')[0].trim();
const firstRow = firstLine
.split(delimiter)
.map(column => column.replace(/^"(.*)"$/, '$1'));
setColumns(firstRow);
setFileLoading(false);
} catch (error) {
addDangerToast('Failed to process file content');
setFileLoading(false);
}
};
const processExcelColumns = (workbook: XLSX.WorkBook, sn: string[]) => {
if (!workbook) {
return;
}
let cSheetName = currentSheetName;
if (!currentSheetName) {
setCurrentSheetName(sn[0]);
cSheetName = sn[0];
}
cSheetName = cSheetName || sn[0];
form.setFieldsValue({ sheet_name: cSheetName });
const worksheet = workbook.Sheets[cSheetName];
const worksheetRef: string = worksheet['!ref'] ? worksheet['!ref'] : '';
const range = XLSX.utils.decode_range(worksheetRef);
const columnNames = Array.from({ length: range.e.c + 1 }, (_, i) => {
const cellAddress = XLSX.utils.encode_cell({ r: 0, c: i });
return worksheet[cellAddress]?.v;
});
setColumns(columnNames);
};
const processExcelFile = async (file: File) =>
new Promise<string>((resolve, reject) => {
setFileLoading(true);
const reader = new FileReader();
reader.readAsBinaryString(file);
reader.onload = event => {
if (!event.target && event.target == null) {
reader.onerror = () => {
reject(new Error('Failed to read file content'));
};
return;
}
// Read workbook
const workbook = XLSX.read(event.target.result, { type: 'binary' });
if (workbook == null) {
reject(new Error('Failed to process file content'));
addDangerToast('Failed to process file content');
setFileLoading(false);
return;
}
// Extract sheet names
const tmpSheetNames = workbook.SheetNames;
if (tmpSheetNames.length < 1) {
reject(new Error('Failed to read file content'));
addDangerToast('Failed to process file content');
setFileLoading(false);
return;
}
processExcelColumns(workbook, tmpSheetNames);
setSheetNames(workbook.SheetNames);
setFileLoading(false);
resolve('success');
};
});
const onChangeFile = async (info: UploadChangeParam<any>) => {
setFileList([
{
@ -523,14 +518,7 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
if (!previewUploadedFile) {
return;
}
if (type === 'csv') {
await processCSVFile(info.file.originFileObj);
}
if (type === 'excel') {
setSheetNames([]);
setCurrentSheetName(undefined);
await processExcelFile(info.file.originFileObj);
}
await loadFileMetadata(info.file.originFileObj);
};
useEffect(() => {
@ -542,25 +530,10 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
if (!previewUploadedFile) {
return;
}
processCSVFile(fileList[0].originFileObj).then(r => r);
loadFileMetadata(fileList[0].originFileObj).then(r => r);
}
}, [delimiter]);
useEffect(() => {
(async () => {
if (
columns.length > 0 &&
fileList[0].originFileObj &&
fileList[0].originFileObj instanceof File
) {
if (!previewUploadedFile) {
return;
}
await processExcelFile(fileList[0].originFileObj);
}
})();
}, [currentSheetName]);
const validateUpload = (_: any, value: string) => {
if (fileList.length === 0) {
return Promise.reject(t('Uploading a file is required'));
@ -734,9 +707,9 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
</StyledFormItem>
</Col>
</Row>
{isFieldATypeSpecificField('delimiter', type) && (
<Row>
<Col span={24}>
{type === 'csv' && (
<StyledFormItemWithTip
label={t('Delimiter')}
tip={t('Select a delimiter for this data')}
@ -749,8 +722,12 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
allowNewOptions
/>
</StyledFormItemWithTip>
</Col>
</Row>
)}
{type === 'excel' && (
{isFieldATypeSpecificField('sheet_name', type) && (
<Row>
<Col span={24}>
<StyledFormItem label={t('Sheet name')} name="sheet_name">
<Select
ariaLabel={t('Choose sheet name')}
@ -762,9 +739,9 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
)}
/>
</StyledFormItem>
)}
</Col>
</Row>
)}
</Collapse.Panel>
<Collapse.Panel
header={
@ -794,6 +771,7 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
</StyledFormItemWithTip>
</Col>
</Row>
{isFieldATypeSpecificField('column_dates', type) && (
<Row>
<Col span={24}>
<StyledFormItem
@ -813,6 +791,8 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
</StyledFormItem>
</Col>
</Row>
)}
{isFieldATypeSpecificField('decimal_character', type) && (
<Row>
<Col span={24}>
<StyledFormItemWithTip
@ -824,6 +804,8 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
</StyledFormItemWithTip>
</Col>
</Row>
)}
{isFieldATypeSpecificField('null_values', type) && (
<Row>
<Col span={24}>
<StyledFormItemWithTip
@ -842,8 +824,8 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
</StyledFormItemWithTip>
</Col>
</Row>
{type === 'csv' && (
<>
)}
{isFieldATypeSpecificField('skip_initial_space', type) && (
<Row>
<Col span={24}>
<StyledFormItem name="skip_initial_space">
@ -854,6 +836,8 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
</StyledFormItem>
</Col>
</Row>
)}
{isFieldATypeSpecificField('skip_blank_lines', type) && (
<Row>
<Col span={24}>
<StyledFormItem name="skip_blank_lines">
@ -866,6 +850,8 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
</StyledFormItem>
</Col>
</Row>
)}
{isFieldATypeSpecificField('day_first', type) && (
<Row>
<Col span={24}>
<StyledFormItem name="day_first">
@ -878,7 +864,6 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
</StyledFormItem>
</Col>
</Row>
</>
)}
</Collapse.Panel>
<Collapse.Panel
@ -894,40 +879,6 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
}
key="3"
>
<Row>
<Col span={24}>
<StyledFormItemWithTip
label={t('Index Column')}
tip={t(
'Column to use as the row labels of the dataframe. Leave empty if no index column',
)}
name="index_column"
>
<Select
ariaLabel={t('Choose index column')}
options={columns.map(column => ({
value: column,
label: column,
}))}
allowClear
allowNewOptions
/>
</StyledFormItemWithTip>
</Col>
</Row>
<Row>
<Col span={24}>
<StyledFormItemWithTip
label={t('Column Label(s)')}
tip={t(
'Column label for index column(s). If None is given and Dataframe Index is checked, Index Names are used',
)}
name="column_labels"
>
<Input aria-label={t('Column labels')} type="text" />
</StyledFormItemWithTip>
</Col>
</Row>
<Row>
<Col span={24}>
<StyledFormItem
@ -947,7 +898,7 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
</StyledFormItem>
</Col>
</Row>
{type === 'csv' && (
{isFieldATypeSpecificField('column_data_types', type) && (
<Row>
<Col span={24}>
<StyledFormItemWithTip
@ -966,19 +917,64 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
<Col span={24}>
<StyledFormItem name="dataframe_index">
<SwitchContainer
label={t('Write dataframe index as a column')}
label={t('Create dataframe index')}
dataTest="dataFrameIndex"
onChange={setCurrentDataframeIndex}
/>
</StyledFormItem>
</Col>
</Row>
{currentDataframeIndex &&
isFieldATypeSpecificField('index_column', type) && (
<Row>
<Col span={24}>
<StyledFormItemWithTip
label={t('Index Column')}
tip={t(
'Column to use as the index of the dataframe. If None is given, Index label is used.',
)}
name="index_column"
>
<Select
ariaLabel={t('Choose index column')}
options={columns.map(column => ({
value: column,
label: column,
}))}
allowClear
allowNewOptions
/>
</StyledFormItemWithTip>
</Col>
</Row>
)}
{currentDataframeIndex && (
<Row>
<Col span={24}>
<StyledFormItemWithTip
label={t('Index Label')}
tip={t(
"Label for the index column. Don't use an existing column name.",
)}
name="index_label"
>
<Input aria-label={t('Index label')} type="text" />
</StyledFormItemWithTip>
</Col>
</Row>
)}
</Collapse.Panel>
{isFieldATypeSpecificField('header_row', type) &&
isFieldATypeSpecificField('rows_to_read', type) &&
isFieldATypeSpecificField('skip_rows', type) && (
<Collapse.Panel
header={
<div>
<h4>{t('Rows')}</h4>
<p className="helper">
{t('Set header rows and the number of rows to read or skip.')}
{t(
'Set header rows and the number of rows to read or skip.',
)}
</p>
</div>
}
@ -1019,13 +1015,16 @@ const UploadDataModal: FunctionComponent<UploadDataModalProps> = ({
label={t('Skip Rows')}
tip={t('Number of rows to skip at start of file.')}
name="skip_rows"
rules={[{ required: true, message: 'Skip rows is required' }]}
rules={[
{ required: true, message: 'Skip rows is required' },
]}
>
<InputNumber aria-label={t('Skip rows')} min={0} />
</StyledFormItemWithTip>
</Col>
</Row>
</Collapse.Panel>
)}
</Collapse>
</AntdForm>
</Modal>

View File

@ -43,19 +43,19 @@ const dropdownItems = [
{
label: 'Upload a CSV',
name: 'Upload a CSV',
url: '/csvtodatabaseview/form',
url: '#',
perm: true,
},
{
label: 'Upload a Columnar File',
name: 'Upload a Columnar file',
url: '/columnartodatabaseview/form',
url: '#',
perm: true,
},
{
label: 'Upload Excel',
name: 'Upload Excel',
url: '/exceltodatabaseview/form',
url: '#',
perm: true,
},
],

View File

@ -54,13 +54,13 @@ const dropdownItems = [
{
label: 'Upload CSV to database',
name: 'Upload a CSV',
url: '/csvtodatabaseview/form',
url: '#',
perm: true,
},
{
label: 'Upload columnar file to database',
name: 'Upload a Columnar file',
url: '/columnartodatabaseview/form',
url: '#',
perm: true,
},
],
@ -309,12 +309,10 @@ test('If there is a DB with allow_file_upload set as True the option should be e
userEvent.hover(dropdown);
const dataMenu = await screen.findByText(dropdownItems[0].label);
userEvent.hover(dataMenu);
expect(await screen.findByText('Upload CSV to database')).toBeInTheDocument();
expect(
(await screen.findByText('Upload CSV to database')).closest('a'),
).toHaveAttribute('href', '#');
expect(
(await screen.findByText('Upload Excel to database')).closest('a'),
).toHaveAttribute('href', '#');
await screen.findByText('Upload Excel to database'),
).toBeInTheDocument();
});
test('If there is NOT a DB with allow_file_upload set as True the option should be disabled', async () => {

View File

@ -45,6 +45,7 @@ import {
} from 'src/types/bootstrapTypes';
import { RootState } from 'src/dashboard/types';
import DatabaseModal from 'src/features/databases/DatabaseModal';
import UploadDataModal from 'src/features/databases/UploadDataModel';
import { uploadUserPerms } from 'src/views/CRUD/utils';
import TelemetryPixel from 'src/components/TelemetryPixel';
import LanguagePicker from './LanguagePicker';
@ -143,6 +144,11 @@ const RightMenu = ({
HAS_GSHEETS_INSTALLED,
} = useSelector<any, ExtensionConfigs>(state => state.common.conf);
const [showDatabaseModal, setShowDatabaseModal] = useState<boolean>(false);
const [showCSVUploadModal, setShowCSVUploadModal] = useState<boolean>(false);
const [showExcelUploadModal, setShowExcelUploadModal] =
useState<boolean>(false);
const [showColumnarUploadModal, setShowColumnarUploadModal] =
useState<boolean>(false);
const [engine, setEngine] = useState<string>('');
const canSql = findPermission('can_sqllab', 'Superset', roles);
const canDashboard = findPermission('can_write', 'Dashboard', roles);
@ -188,23 +194,20 @@ const RightMenu = ({
},
{
label: t('Upload CSV to database'),
name: 'Upload a CSV',
url: '#',
name: GlobalMenuDataOptions.CSVUpload,
perm: canUploadCSV && showUploads,
disable: isAdmin && !allowUploads,
},
{
label: t('Upload columnar file to database'),
name: 'Upload a Columnar file',
url: '/columnartodatabaseview/form',
perm: canUploadColumnar && showUploads,
label: t('Upload Excel to database'),
name: GlobalMenuDataOptions.ExcelUpload,
perm: canUploadExcel && showUploads,
disable: isAdmin && !allowUploads,
},
{
label: t('Upload Excel to database'),
name: 'Upload Excel',
url: '#',
perm: canUploadExcel && showUploads,
label: t('Upload Columnar file to database'),
name: GlobalMenuDataOptions.ColumnarUpload,
perm: canUploadColumnar && showUploads,
disable: isAdmin && !allowUploads,
},
],
@ -289,6 +292,12 @@ const RightMenu = ({
} else if (itemChose.key === GlobalMenuDataOptions.GoogleSheets) {
setShowDatabaseModal(true);
setEngine('Google Sheets');
} else if (itemChose.key === GlobalMenuDataOptions.CSVUpload) {
setShowCSVUploadModal(true);
} else if (itemChose.key === GlobalMenuDataOptions.ExcelUpload) {
setShowExcelUploadModal(true);
} else if (itemChose.key === GlobalMenuDataOptions.ColumnarUpload) {
setShowColumnarUploadModal(true);
}
};
@ -350,6 +359,30 @@ const RightMenu = ({
onDatabaseAdd={handleDatabaseAdd}
/>
)}
{canUploadCSV && (
<UploadDataModal
onHide={() => setShowCSVUploadModal(false)}
show={showCSVUploadModal}
allowedExtensions={CSV_EXTENSIONS}
type="csv"
/>
)}
{canUploadExcel && (
<UploadDataModal
onHide={() => setShowExcelUploadModal(false)}
show={showExcelUploadModal}
allowedExtensions={EXCEL_EXTENSIONS}
type="excel"
/>
)}
{canUploadColumnar && (
<UploadDataModal
onHide={() => setShowColumnarUploadModal(false)}
show={showColumnarUploadModal}
allowedExtensions={COLUMNAR_EXTENSIONS}
type="columnar"
/>
)}
{environmentTag?.text && (
<Label
css={{ borderRadius: `${theme.gridUnit * 125}px` }}

View File

@ -51,4 +51,7 @@ export enum GlobalMenuDataOptions {
GoogleSheets = 'gsheets',
DbConnection = 'dbconnection',
DatasetCreation = 'datasetCreation',
CSVUpload = 'csvUpload',
ExcelUpload = 'excelUpload',
ColumnarUpload = 'columnarUpload',
}

View File

@ -140,6 +140,8 @@ function DatabaseList({
useState<boolean>(false);
const [excelUploadDataModalOpen, setExcelUploadDataModalOpen] =
useState<boolean>(false);
const [columnarUploadDataModalOpen, setColumnarUploadDataModalOpen] =
useState<boolean>(false);
const [allowUploads, setAllowUploads] = useState<boolean>(false);
const isAdmin = isUserAdmin(fullUser);
@ -257,9 +259,12 @@ function DatabaseList({
disable: isDisabled,
},
{
label: t('Upload columnar file'),
label: t('Upload Columnar'),
name: 'Upload columnar file',
url: '/columnartodatabaseview/form',
url: '#',
onClick: () => {
setColumnarUploadDataModalOpen(true);
},
perm: canUploadColumnar && showUploads,
disable: isDisabled,
},
@ -577,6 +582,7 @@ function DatabaseList({
}}
show={csvUploadDataModalOpen}
allowedExtensions={CSV_EXTENSIONS}
type="csv"
/>
<UploadDataModal
addDangerToast={addDangerToast}
@ -588,6 +594,16 @@ function DatabaseList({
allowedExtensions={EXCEL_EXTENSIONS}
type="excel"
/>
<UploadDataModal
addDangerToast={addDangerToast}
addSuccessToast={addSuccessToast}
onHide={() => {
setColumnarUploadDataModalOpen(false);
}}
show={columnarUploadDataModalOpen}
allowedExtensions={COLUMNAR_EXTENSIONS}
type="columnar"
/>
{databaseCurrentlyDeleting && (
<DeleteModal
description={

View File

@ -491,7 +491,7 @@ export const uploadUserPerms = (
checkUploadExtensions(csvExt, allowedExt);
const canUploadColumnar =
checkUploadExtensions(colExt, allowedExt) &&
findPermission('can_this_form_get', 'ColumnarToDatabaseView', roles);
findPermission('can_columnar_upload', 'Database', roles);
const canUploadExcel =
checkUploadExtensions(excelExt, allowedExt) &&
findPermission('can_excel_upload', 'Database', roles);

View File

@ -14,7 +14,6 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import logging
from abc import abstractmethod
from typing import Any, Optional, TypedDict
@ -22,6 +21,7 @@ from typing import Any, Optional, TypedDict
import pandas as pd
from flask_babel import lazy_gettext as _
from sqlalchemy.exc import SQLAlchemyError
from werkzeug.datastructures import FileStorage
from superset import db
from superset.commands.base import BaseCommand
@ -46,8 +46,17 @@ READ_CHUNK_SIZE = 1000
class ReaderOptions(TypedDict, total=False):
already_exists: str
column_labels: str
index_column: str
index_label: str
dataframe_index: bool
class FileMetadataItem(TypedDict):
    # Sheet the columns were read from; None for formats without sheets
    # (CSV, parquet).
    sheet_name: Optional[str]
    # Column names discovered in the file (or in this sheet).
    column_names: list[str]
class FileMetadata(TypedDict, total=False):
    # One entry per Excel sheet, or a single entry for flat formats.
    items: list[FileMetadataItem]
class BaseDataReader:
@ -57,14 +66,21 @@ class BaseDataReader:
to read data from multiple file types (e.g. CSV, Excel, etc.)
"""
def __init__(self, options: dict[str, Any]) -> None:
self._options = options
def __init__(self, options: Optional[dict[str, Any]] = None) -> None:
self._options = options or {}
@abstractmethod
def file_to_dataframe(self, file: Any) -> pd.DataFrame: ...
def file_to_dataframe(self, file: FileStorage) -> pd.DataFrame: ...
@abstractmethod
def file_metadata(self, file: FileStorage) -> FileMetadata: ...
def read(
self, file: Any, database: Database, table_name: str, schema_name: Optional[str]
self,
file: FileStorage,
database: Database,
table_name: str,
schema_name: Optional[str],
) -> None:
self._dataframe_to_database(
self.file_to_dataframe(file), database, table_name, schema_name
@ -85,16 +101,20 @@ class BaseDataReader:
"""
try:
data_table = Table(table=table_name, schema=schema_name)
to_sql_kwargs = {
"chunksize": READ_CHUNK_SIZE,
"if_exists": self._options.get("already_exists", "fail"),
"index": self._options.get("dataframe_index", False),
}
if self._options.get("index_label") and self._options.get(
"dataframe_index"
):
to_sql_kwargs["index_label"] = self._options.get("index_label")
database.db_engine_spec.df_to_sql(
database,
data_table,
df,
to_sql_kwargs={
"chunksize": READ_CHUNK_SIZE,
"if_exists": self._options.get("already_exists", "fail"),
"index": self._options.get("index_column"),
"index_label": self._options.get("column_labels"),
},
to_sql_kwargs=to_sql_kwargs,
)
except ValueError as ex:
raise DatabaseUploadFailed(

View File

@ -0,0 +1,134 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import logging
from collections.abc import Generator
from io import BytesIO
from pathlib import Path
from typing import Any, IO, Optional
from zipfile import BadZipfile, is_zipfile, ZipFile
import pandas as pd
import pyarrow.parquet as pq
from flask_babel import lazy_gettext as _
from pyarrow.lib import ArrowException
from werkzeug.datastructures import FileStorage
from superset.commands.database.exceptions import DatabaseUploadFailed
from superset.commands.database.uploaders.base import (
BaseDataReader,
FileMetadata,
ReaderOptions,
)
logger = logging.getLogger(__name__)
class ColumnarReaderOptions(ReaderOptions, total=False):
    # Optional subset of columns to load from the parquet file(s);
    # when absent or empty, all columns are read.
    columns_read: list[str]
class ColumnarReader(BaseDataReader):
    """
    Reads columnar (parquet) uploads into DataFrames, transparently
    unpacking ZIP bundles that contain one or more parquet files.
    """

    def __init__(
        self,
        options: Optional[ColumnarReaderOptions] = None,
    ) -> None:
        super().__init__(options=dict(options or {}))

    def _read_buffer_to_dataframe(self, buffer: IO[bytes]) -> pd.DataFrame:
        """Parse a single parquet buffer, honoring the columns_read option."""
        read_kwargs: dict[str, Any] = {"path": buffer}
        requested_columns = self._options.get("columns_read")
        if requested_columns:  # an empty list means "read everything"
            read_kwargs["columns"] = requested_columns
        try:
            return pd.read_parquet(**read_kwargs)
        except (
            pd.errors.ParserError,
            pd.errors.EmptyDataError,
            UnicodeDecodeError,
            ValueError,
        ) as ex:
            raise DatabaseUploadFailed(
                message=_("Parsing error: %(error)s", error=str(ex))
            ) from ex
        except Exception as ex:
            raise DatabaseUploadFailed(_("Error reading Columnar file")) from ex

    @staticmethod
    def _yield_files(file: FileStorage) -> Generator[IO[bytes], None, None]:
        """
        Yields files from the provided file. If the file is a zip file, it yields each
        file within the zip file. If it's a single file, it yields the file itself.

        :param file: The file to yield files from.
        :return: A generator that yields files.
        """
        suffix = Path(file.filename).suffix
        if not suffix:
            raise DatabaseUploadFailed(_("Unexpected no file extension found"))
        if suffix[1:] != "zip":  # strip the leading dot before comparing
            yield file
            return
        if not is_zipfile(file):
            raise DatabaseUploadFailed(_("Not a valid ZIP file"))
        try:
            with ZipFile(file) as bundle:
                # every archive member must share a single extension
                member_suffixes = {Path(name).suffix for name in bundle.namelist()}
                if len(member_suffixes) > 1:
                    raise DatabaseUploadFailed(
                        _("ZIP file contains multiple file types")
                    )
                for member_name in bundle.namelist():
                    with bundle.open(member_name) as member:
                        yield BytesIO(member.read())
        except BadZipfile as ex:
            raise DatabaseUploadFailed(_("Not a valid ZIP file")) from ex

    def file_to_dataframe(self, file: FileStorage) -> pd.DataFrame:
        """
        Read Columnar file into a DataFrame

        :return: pandas DataFrame
        :throws DatabaseUploadFailed: if there is an error reading the file
        """
        frames = [
            self._read_buffer_to_dataframe(buffer)
            for buffer in self._yield_files(file)
        ]
        return pd.concat(frames)

    def file_metadata(self, file: FileStorage) -> FileMetadata:
        """Collect the union of column names across all parquet files."""
        discovered_columns: set[str] = set()
        try:
            for buffer in self._yield_files(file):
                parquet_file = pq.ParquetFile(buffer)
                discovered_columns.update(
                    parquet_file.metadata.schema.names  # pylint: disable=no-member
                )
        except ArrowException as ex:
            raise DatabaseUploadFailed(
                message=_("Parsing error: %(error)s", error=str(ex))
            ) from ex
        return {
            "items": [
                {
                    "column_names": list(discovered_columns),
                    "sheet_name": None,
                }
            ]
        }

View File

@ -15,17 +15,23 @@
# specific language governing permissions and limitations
# under the License.
import logging
from typing import Any
from typing import Any, Optional
import pandas as pd
from flask_babel import lazy_gettext as _
from werkzeug.datastructures import FileStorage
from superset.commands.database.exceptions import DatabaseUploadFailed
from superset.commands.database.uploaders.base import BaseDataReader, ReaderOptions
from superset.commands.database.uploaders.base import (
BaseDataReader,
FileMetadata,
ReaderOptions,
)
logger = logging.getLogger(__name__)
READ_CSV_CHUNK_SIZE = 1000
ROWS_TO_READ_METADATA = 2
class CSVReaderOptions(ReaderOptions, total=False):
@ -33,7 +39,7 @@ class CSVReaderOptions(ReaderOptions, total=False):
column_data_types: dict[str, str]
column_dates: list[str]
columns_read: list[str]
dataframe_index: str
index_column: str
day_first: bool
decimal_character: str
header_row: int
@ -47,48 +53,27 @@ class CSVReaderOptions(ReaderOptions, total=False):
class CSVReader(BaseDataReader):
def __init__(
self,
options: CSVReaderOptions,
options: Optional[CSVReaderOptions] = None,
) -> None:
options = options or {}
super().__init__(
options=dict(options),
)
def file_to_dataframe(self, file: Any) -> pd.DataFrame:
"""
Read CSV file into a DataFrame
:return: pandas DataFrame
:throws DatabaseUploadFailed: if there is an error reading the CSV file
"""
@staticmethod
def _read_csv(file: FileStorage, kwargs: dict[str, Any]) -> pd.DataFrame:
try:
if "chunksize" in kwargs:
return pd.concat(
pd.read_csv(
chunksize=READ_CSV_CHUNK_SIZE,
encoding="utf-8",
filepath_or_buffer=file,
header=self._options.get("header_row", 0),
decimal=self._options.get("decimal_character", "."),
index_col=self._options.get("index_column"),
dayfirst=self._options.get("day_first", False),
iterator=True,
keep_default_na=not self._options.get("null_values"),
usecols=self._options.get("columns_read")
if self._options.get("columns_read") # None if an empty list
else None,
na_values=self._options.get("null_values")
if self._options.get("null_values") # None if an empty list
else None,
nrows=self._options.get("rows_to_read"),
parse_dates=self._options.get("column_dates"),
sep=self._options.get("delimiter", ","),
skip_blank_lines=self._options.get("skip_blank_lines", False),
skipinitialspace=self._options.get("skip_initial_space", False),
skiprows=self._options.get("skip_rows", 0),
dtype=self._options.get("column_data_types")
if self._options.get("column_data_types")
else None,
filepath_or_buffer=file.stream,
**kwargs,
)
)
return pd.read_csv(
filepath_or_buffer=file.stream,
**kwargs,
)
except (
pd.errors.ParserError,
pd.errors.EmptyDataError,
@ -100,3 +85,59 @@ class CSVReader(BaseDataReader):
) from ex
except Exception as ex:
raise DatabaseUploadFailed(_("Error reading CSV file")) from ex
def file_to_dataframe(self, file: FileStorage) -> pd.DataFrame:
"""
Read CSV file into a DataFrame
:return: pandas DataFrame
:throws DatabaseUploadFailed: if there is an error reading the file
"""
kwargs = {
"chunksize": READ_CSV_CHUNK_SIZE,
"encoding": "utf-8",
"header": self._options.get("header_row", 0),
"decimal": self._options.get("decimal_character", "."),
"index_col": self._options.get("index_column"),
"dayfirst": self._options.get("day_first", False),
"iterator": True,
"keep_default_na": not self._options.get("null_values"),
"usecols": self._options.get("columns_read")
if self._options.get("columns_read") # None if an empty list
else None,
"na_values": self._options.get("null_values")
if self._options.get("null_values") # None if an empty list
else None,
"nrows": self._options.get("rows_to_read"),
"parse_dates": self._options.get("column_dates"),
"sep": self._options.get("delimiter", ","),
"skip_blank_lines": self._options.get("skip_blank_lines", False),
"skipinitialspace": self._options.get("skip_initial_space", False),
"skiprows": self._options.get("skip_rows", 0),
"dtype": self._options.get("column_data_types")
if self._options.get("column_data_types")
else None,
}
return self._read_csv(file, kwargs)
def file_metadata(self, file: FileStorage) -> FileMetadata:
"""
Get metadata from a CSV file
:return: FileMetadata
:throws DatabaseUploadFailed: if there is an error reading the file
"""
kwargs = {
"nrows": ROWS_TO_READ_METADATA,
"header": self._options.get("header_row", 0),
"sep": self._options.get("delimiter", ","),
}
df = self._read_csv(file, kwargs)
return {
"items": [
{
"column_names": df.columns.tolist(),
"sheet_name": None,
}
]
}

View File

@ -15,22 +15,29 @@
# specific language governing permissions and limitations
# under the License.
import logging
from typing import Any
from typing import Optional
import pandas as pd
from flask_babel import lazy_gettext as _
from werkzeug.datastructures import FileStorage
from superset.commands.database.exceptions import DatabaseUploadFailed
from superset.commands.database.uploaders.base import BaseDataReader, ReaderOptions
from superset.commands.database.uploaders.base import (
BaseDataReader,
FileMetadata,
ReaderOptions,
)
logger = logging.getLogger(__name__)
ROWS_TO_READ_METADATA = 2
class ExcelReaderOptions(ReaderOptions, total=False):
sheet_name: str
column_dates: list[str]
columns_read: list[str]
dataframe_index: str
index_column: str
decimal_character: str
header_row: int
null_values: list[str]
@ -41,18 +48,19 @@ class ExcelReaderOptions(ReaderOptions, total=False):
class ExcelReader(BaseDataReader):
def __init__(
self,
options: ExcelReaderOptions,
options: Optional[ExcelReaderOptions] = None,
) -> None:
options = options or {}
super().__init__(
options=dict(options),
)
def file_to_dataframe(self, file: Any) -> pd.DataFrame:
def file_to_dataframe(self, file: FileStorage) -> pd.DataFrame:
"""
Read Excel file into a DataFrame
:return: pandas DataFrame
:throws DatabaseUploadFailed: if there is an error reading the CSV file
:throws DatabaseUploadFailed: if there is an error reading the file
"""
kwargs = {
@ -84,3 +92,25 @@ class ExcelReader(BaseDataReader):
) from ex
except Exception as ex:
raise DatabaseUploadFailed(_("Error reading Excel file")) from ex
def file_metadata(self, file: FileStorage) -> FileMetadata:
try:
excel_file = pd.ExcelFile(file)
except (ValueError, AssertionError) as ex:
raise DatabaseUploadFailed(
message=_("Excel file format cannot be determined")
) from ex
sheet_names = excel_file.sheet_names
result: FileMetadata = {"items": []}
for sheet in sheet_names:
df = excel_file.parse(sheet, nrows=ROWS_TO_READ_METADATA)
column_names = df.columns.tolist()
result["items"].append(
{
"sheet_name": sheet,
"column_names": column_names,
}
)
return result

View File

@ -167,6 +167,9 @@ MODEL_API_RW_METHOD_PERMISSION_MAP = {
"delete_object": "write",
"copy_dash": "write",
"get_connection": "write",
"excel_metadata": "excel_upload",
"columnar_metadata": "columnar_upload",
"csv_metadata": "csv_upload",
}
EXTRA_FORM_DATA_APPEND_KEYS = {

View File

@ -58,6 +58,7 @@ from superset.commands.database.tables import TablesDatabaseCommand
from superset.commands.database.test_connection import TestConnectionDatabaseCommand
from superset.commands.database.update import UpdateDatabaseCommand
from superset.commands.database.uploaders.base import UploadCommand
from superset.commands.database.uploaders.columnar_reader import ColumnarReader
from superset.commands.database.uploaders.csv_reader import CSVReader
from superset.commands.database.uploaders.excel_reader import ExcelReader
from superset.commands.database.validate import ValidateDatabaseParametersCommand
@ -72,6 +73,9 @@ from superset.daos.database import DatabaseDAO, DatabaseUserOAuth2TokensDAO
from superset.databases.decorators import check_table_access
from superset.databases.filters import DatabaseFilter, DatabaseUploadEnabledFilter
from superset.databases.schemas import (
ColumnarMetadataUploadFilePostSchema,
ColumnarUploadPostSchema,
CSVMetadataUploadFilePostSchema,
CSVUploadPostSchema,
database_schemas_query_schema,
database_tables_query_schema,
@ -84,6 +88,7 @@ from superset.databases.schemas import (
DatabaseTablesResponse,
DatabaseTestConnectionSchema,
DatabaseValidateParametersSchema,
ExcelMetadataUploadFilePostSchema,
ExcelUploadPostSchema,
get_export_ids_schema,
OAuth2ProviderResponseSchema,
@ -93,6 +98,7 @@ from superset.databases.schemas import (
SelectStarResponseSchema,
TableExtraMetadataResponseSchema,
TableMetadataResponseSchema,
UploadFileMetadata,
ValidateSQLRequest,
ValidateSQLResponse,
)
@ -151,7 +157,11 @@ class DatabaseRestApi(BaseSupersetModelRestApi):
"schemas_access_for_file_upload",
"get_connection",
"csv_upload",
"csv_metadata",
"excel_upload",
"excel_metadata",
"columnar_upload",
"columnar_metadata",
"oauth2",
}
@ -263,6 +273,7 @@ class DatabaseRestApi(BaseSupersetModelRestApi):
openapi_spec_tag = "Database"
openapi_spec_component_schemas = (
ColumnarUploadPostSchema,
CSVUploadPostSchema,
DatabaseConnectionSchema,
DatabaseFunctionNamesResponse,
@ -276,6 +287,10 @@ class DatabaseRestApi(BaseSupersetModelRestApi):
TableMetadataResponseSchema,
SelectStarResponseSchema,
SchemasResponseSchema,
CSVMetadataUploadFilePostSchema,
ExcelMetadataUploadFilePostSchema,
ColumnarMetadataUploadFilePostSchema,
UploadFileMetadata,
ValidateSQLRequest,
ValidateSQLResponse,
)
@ -1524,11 +1539,60 @@ class DatabaseRestApi(BaseSupersetModelRestApi):
command.run()
return self.response(200, message="OK")
@expose("/csv_metadata/", methods=("POST",))
@protect()
@statsd_metrics
@event_logger.log_this_with_context(
action=(
lambda self, *args, **kwargs: f"{self.__class__.__name__}" ".csv_metadata"
),
log_to_statsd=False,
)
@requires_form_data
def csv_metadata(self) -> Response:
"""Upload an CSV file and returns file metadata.
---
post:
summary: Upload an CSV file and returns file metadata
requestBody:
required: true
content:
multipart/form-data:
schema:
$ref: '#/components/schemas/CSVMetadataUploadFilePostSchema'
responses:
200:
description: Columnar upload response
content:
application/json:
schema:
type: object
properties:
result:
$ref: '#/components/schemas/UploadFileMetadata'
400:
$ref: '#/components/responses/400'
401:
$ref: '#/components/responses/401'
404:
$ref: '#/components/responses/404'
500:
$ref: '#/components/responses/500'
"""
try:
request_form = request.form.to_dict()
request_form["file"] = request.files.get("file")
parameters = CSVMetadataUploadFilePostSchema().load(request_form)
except ValidationError as error:
return self.response_400(message=error.messages)
metadata = CSVReader(parameters).file_metadata(parameters["file"])
return self.response(200, result=UploadFileMetadata().dump(metadata))
@expose("/<int:pk>/csv_upload/", methods=("POST",))
@protect()
@statsd_metrics
@event_logger.log_this_with_context(
action=lambda self, *args, **kwargs: f"{self.__class__.__name__}.import_",
action=lambda self, *args, **kwargs: f"{self.__class__.__name__}.csv_upload",
log_to_statsd=False,
)
@requires_form_data
@ -1549,7 +1613,7 @@ class DatabaseRestApi(BaseSupersetModelRestApi):
schema:
$ref: '#/components/schemas/CSVUploadPostSchema'
responses:
200:
201:
description: CSV upload response
content:
application/json:
@ -1582,13 +1646,62 @@ class DatabaseRestApi(BaseSupersetModelRestApi):
).run()
except ValidationError as error:
return self.response_400(message=error.messages)
return self.response(200, message="OK")
return self.response(201, message="OK")
@expose("/excel_metadata/", methods=("POST",))
@protect()
@statsd_metrics
@event_logger.log_this_with_context(
action=(
lambda self, *args, **kwargs: f"{self.__class__.__name__}" ".excel_metadata"
),
log_to_statsd=False,
)
@requires_form_data
def excel_metadata(self) -> Response:
"""Upload an Excel file and returns file metadata.
---
post:
summary: Upload an Excel file and returns file metadata
requestBody:
required: true
content:
multipart/form-data:
schema:
$ref: '#/components/schemas/ExcelMetadataUploadFilePostSchema'
responses:
200:
description: Columnar upload response
content:
application/json:
schema:
type: object
properties:
result:
$ref: '#/components/schemas/UploadFileMetadata'
400:
$ref: '#/components/responses/400'
401:
$ref: '#/components/responses/401'
404:
$ref: '#/components/responses/404'
500:
$ref: '#/components/responses/500'
"""
try:
request_form = request.form.to_dict()
request_form["file"] = request.files.get("file")
parameters = ExcelMetadataUploadFilePostSchema().load(request_form)
except ValidationError as error:
return self.response_400(message=error.messages)
metadata = ExcelReader().file_metadata(parameters["file"])
return self.response(200, result=UploadFileMetadata().dump(metadata))
@expose("/<int:pk>/excel_upload/", methods=("POST",))
@protect()
@statsd_metrics
@event_logger.log_this_with_context(
action=lambda self, *args, **kwargs: f"{self.__class__.__name__}.import_",
action=lambda self, *args, **kwargs: f"{self.__class__.__name__}.excel_upload",
log_to_statsd=False,
)
@requires_form_data
@ -1609,7 +1722,7 @@ class DatabaseRestApi(BaseSupersetModelRestApi):
schema:
$ref: '#/components/schemas/ExcelUploadPostSchema'
responses:
200:
201:
description: Excel upload response
content:
application/json:
@ -1642,7 +1755,117 @@ class DatabaseRestApi(BaseSupersetModelRestApi):
).run()
except ValidationError as error:
return self.response_400(message=error.messages)
return self.response(200, message="OK")
return self.response(201, message="OK")
@expose("/columnar_metadata/", methods=("POST",))
@protect()
@statsd_metrics
@event_logger.log_this_with_context(
action=lambda self, *args, **kwargs: f"{self.__class__.__name__}"
".columnar_metadata",
log_to_statsd=False,
)
@requires_form_data
def columnar_metadata(self) -> Response:
"""Upload a Columnar file and returns file metadata.
---
post:
summary: Upload a Columnar file and returns file metadata
requestBody:
required: true
content:
multipart/form-data:
schema:
$ref: '#/components/schemas/ColumnarMetadataUploadFilePostSchema'
responses:
200:
description: Columnar upload response
content:
application/json:
schema:
type: object
properties:
result:
$ref: '#/components/schemas/UploadFileMetadata'
400:
$ref: '#/components/responses/400'
401:
$ref: '#/components/responses/401'
404:
$ref: '#/components/responses/404'
500:
$ref: '#/components/responses/500'
"""
try:
request_form = request.form.to_dict()
request_form["file"] = request.files.get("file")
parameters = ColumnarMetadataUploadFilePostSchema().load(request_form)
except ValidationError as error:
return self.response_400(message=error.messages)
metadata = ColumnarReader().file_metadata(parameters["file"])
return self.response(200, result=UploadFileMetadata().dump(metadata))
@expose("/<int:pk>/columnar_upload/", methods=("POST",))
@protect()
@statsd_metrics
@event_logger.log_this_with_context(
action=lambda self,
*args,
**kwargs: f"{self.__class__.__name__}.columnar_upload",
log_to_statsd=False,
)
@requires_form_data
def columnar_upload(self, pk: int) -> Response:
"""Upload a Columnar file into a database.
---
post:
summary: Upload a Columnar file to a database table
parameters:
- in: path
schema:
type: integer
name: pk
requestBody:
required: true
content:
multipart/form-data:
schema:
$ref: '#/components/schemas/ColumnarUploadPostSchema'
responses:
201:
description: Columnar upload response
content:
application/json:
schema:
type: object
properties:
message:
type: string
400:
$ref: '#/components/responses/400'
401:
$ref: '#/components/responses/401'
404:
$ref: '#/components/responses/404'
422:
$ref: '#/components/responses/422'
500:
$ref: '#/components/responses/500'
"""
try:
request_form = request.form.to_dict()
request_form["file"] = request.files.get("file")
parameters = ColumnarUploadPostSchema().load(request_form)
UploadCommand(
pk,
parameters["table_name"],
parameters["file"],
parameters.get("schema"),
ColumnarReader(parameters),
).run()
except ValidationError as error:
return self.response_400(message=error.messages)
return self.response(201, message="OK")
@expose("/<int:pk>/function_names/", methods=("GET",))
@protect()

View File

@ -22,7 +22,7 @@ from __future__ import annotations
import inspect
import json
import os
import re
from pathlib import Path
from typing import Any, TypedDict
from flask import current_app
@ -1054,7 +1054,20 @@ class DelimitedListField(fields.List):
) from exc
class BaseUploadPostSchema(Schema):
class BaseUploadFilePostSchema(Schema):
    """Base schema for file uploads; validates the uploaded file's extension."""

    # App-config key naming the extensions permitted for this upload type;
    # concrete subclasses override it (e.g. "CSV_EXTENSIONS").
    _extension_config_key = ""

    @validates("file")
    def validate_file_extension(self, file: FileStorage) -> None:
        """Reject files whose extension is outside the configured allow-list."""
        app_config = current_app.config
        permitted = app_config["ALLOWED_EXTENSIONS"].intersection(
            app_config[self._extension_config_key]
        )
        suffix = Path(file.filename).suffix
        # `suffix` includes the leading dot ("" when there is no extension);
        # strip it before checking membership.
        extension = suffix[1:] if suffix else ""
        if not extension or extension not in permitted:
            raise ValidationError([_("File extension is not allowed.")])
class BaseUploadPostSchema(BaseUploadFilePostSchema):
already_exists = fields.String(
load_default="fail",
validate=OneOf(choices=("fail", "replace", "append")),
@ -1063,6 +1076,79 @@ class BaseUploadPostSchema(Schema):
"exists accepts: fail, replace, append"
},
)
index_label = fields.String(
metadata={"description": "Index label for index column."}
)
columns_read = DelimitedListField(
fields.String(),
metadata={"description": "A List of the column names that should be read"},
)
dataframe_index = fields.Boolean(
metadata={"description": "Write dataframe index as a column."}
)
schema = fields.String(
metadata={"description": "The schema to upload the data file to."}
)
table_name = fields.String(
required=True,
validate=[Length(min=1, max=10000)],
allow_none=False,
metadata={"description": "The name of the table to be created/appended"},
)
class ColumnarUploadPostSchema(BaseUploadPostSchema):
    """
    Schema for Columnar Upload
    """

    # Allowed extensions come from the COLUMNAR_EXTENSIONS config entry,
    # intersected with ALLOWED_EXTENSIONS by the base-class validator.
    _extension_config_key = "COLUMNAR_EXTENSIONS"

    # fields.Raw is used because marshmallow has no native file-upload type;
    # the type/format metadata describe it for the generated API spec.
    file = fields.Raw(
        required=True,
        metadata={
            "description": "The Columnar file to upload",
            "type": "string",
            "format": "binary",
        },
    )
class CSVUploadPostSchema(BaseUploadPostSchema):
"""
Schema for CSV Upload
"""
_extension_config_key = "CSV_EXTENSIONS"
file = fields.Raw(
required=True,
metadata={
"description": "The CSV file to upload",
"type": "string",
"format": "text/csv",
},
)
delimiter = fields.String(metadata={"description": "The delimiter of the CSV file"})
column_data_types = fields.String(
metadata={
"description": "A dictionary with column names and "
"their data types if you need to change "
"the defaults. Example: {'user_id':'int'}. "
"Check Python Pandas library for supported data types"
}
)
day_first = fields.Boolean(
metadata={
"description": "DD/MM format dates, international and European format"
}
)
skip_blank_lines = fields.Boolean(
metadata={"description": "Skip blank lines in the CSV file."}
)
skip_initial_space = fields.Boolean(
metadata={"description": "Skip spaces after delimiter."}
)
column_dates = DelimitedListField(
fields.String(),
metadata={
@ -1070,23 +1156,6 @@ class BaseUploadPostSchema(Schema):
"parsed as dates. Example: date,timestamp"
},
)
column_labels = fields.String(
metadata={
"description": "Column label for index column(s). "
"If None is given and Dataframe"
"Index is checked, Index Names are used"
}
)
columns_read = DelimitedListField(
fields.String(),
metadata={"description": "A List of the column names that should be read"},
)
dataframe_index = fields.String(
metadata={
"description": "Column to use as the row labels of the dataframe. "
"Leave empty if no index column"
}
)
decimal_character = fields.String(
metadata={
"description": "Character to recognize as decimal point. Default is '.'"
@ -1120,54 +1189,10 @@ class BaseUploadPostSchema(Schema):
allow_none=True,
validate=Range(min=1),
)
schema = fields.String(
metadata={"description": "The schema to upload the data file to."}
)
table_name = fields.String(
required=True,
validate=[Length(min=1, max=10000)],
allow_none=False,
metadata={"description": "The name of the table to be created/appended"},
)
skip_rows = fields.Integer(
metadata={"description": "Number of rows to skip at start of file."}
)
class CSVUploadPostSchema(BaseUploadPostSchema):
"""
Schema for CSV Upload
"""
file = fields.Raw(
required=True,
metadata={
"description": "The CSV file to upload",
"type": "string",
"format": "text/csv",
},
)
delimiter = fields.String(metadata={"description": "The delimiter of the CSV file"})
column_data_types = fields.String(
metadata={
"description": "A dictionary with column names and "
"their data types if you need to change "
"the defaults. Example: {'user_id':'int'}. "
"Check Python Pandas library for supported data types"
}
)
day_first = fields.Boolean(
metadata={
"description": "DD/MM format dates, international and European format"
}
)
skip_blank_lines = fields.Boolean(
metadata={"description": "Skip blank lines in the CSV file."}
)
skip_initial_space = fields.Boolean(
metadata={"description": "Skip spaces after delimiter."}
)
@post_load
def convert_column_data_types(
self, data: dict[str, Any], **kwargs: Any
@ -1191,24 +1216,14 @@ class CSVUploadPostSchema(BaseUploadPostSchema):
):
raise ValidationError([_("File size exceeds the maximum allowed size.")])
@validates("file")
def validate_file_extension(self, file: FileStorage) -> None:
allowed_extensions = current_app.config["ALLOWED_EXTENSIONS"].intersection(
current_app.config["CSV_EXTENSIONS"]
)
matches = re.match(r".+\.([^.]+)$", file.filename)
if not matches:
raise ValidationError([_("File extension is not allowed.")])
extension = matches.group(1)
if extension not in allowed_extensions:
raise ValidationError([_("File extension is not allowed.")])
class ExcelUploadPostSchema(BaseUploadPostSchema):
"""
Schema for Excel Upload
"""
_extension_config_key = "EXCEL_EXTENSIONS"
file = fields.Raw(
required=True,
metadata={
@ -1223,18 +1238,129 @@ class ExcelUploadPostSchema(BaseUploadPostSchema):
"(default is the first sheet)."
}
)
@validates("file")
def validate_file_extension(self, file: FileStorage) -> None:
allowed_extensions = current_app.config["ALLOWED_EXTENSIONS"].intersection(
current_app.config["EXCEL_EXTENSIONS"]
column_dates = DelimitedListField(
fields.String(),
metadata={
"description": "A list of column names that should be "
"parsed as dates. Example: date,timestamp"
},
)
matches = re.match(r".+\.([^.]+)$", file.filename)
if not matches:
raise ValidationError([_("File extension is not allowed.")])
extension = matches.group(1)
if extension not in allowed_extensions:
raise ValidationError([_("File extension is not allowed.")])
decimal_character = fields.String(
metadata={
"description": "Character to recognize as decimal point. Default is '.'"
}
)
header_row = fields.Integer(
metadata={
"description": "Row containing the headers to use as column names"
"(0 is first line of data). Leave empty if there is no header row."
}
)
index_column = fields.String(
metadata={
"description": "Column to use as the row labels of the dataframe. "
"Leave empty if no index column"
}
)
null_values = DelimitedListField(
fields.String(),
metadata={
"description": "A list of strings that should be treated as null. "
"Examples: '' for empty strings, 'None', 'N/A',"
"Warning: Hive database supports only a single value"
},
)
rows_to_read = fields.Integer(
metadata={
"description": "Number of rows to read from the file. "
"If None, reads all rows."
},
allow_none=True,
validate=Range(min=1),
)
skip_rows = fields.Integer(
metadata={"description": "Number of rows to skip at start of file."}
)
class CSVMetadataUploadFilePostSchema(BaseUploadFilePostSchema):
    """
    Schema for CSV metadata.
    """

    # Allowed extensions come from the CSV_EXTENSIONS config entry,
    # intersected with ALLOWED_EXTENSIONS by the base-class validator.
    _extension_config_key = "CSV_EXTENSIONS"

    # fields.Raw is used because marshmallow has no native file-upload type.
    file = fields.Raw(
        required=True,
        metadata={
            "description": "The file to upload",
            "type": "string",
            "format": "binary",
        },
    )
    delimiter = fields.String(metadata={"description": "The delimiter of the CSV file"})
    header_row = fields.Integer(
        metadata={
            "description": "Row containing the headers to use as column names"
            "(0 is first line of data). Leave empty if there is no header row."
        }
    )
class ExcelMetadataUploadFilePostSchema(BaseUploadFilePostSchema):
    """
    Schema for Excel file metadata.
    """

    # Allowed extensions come from the EXCEL_EXTENSIONS config entry,
    # intersected with ALLOWED_EXTENSIONS by the base-class validator.
    _extension_config_key = "EXCEL_EXTENSIONS"

    # fields.Raw is used because marshmallow has no native file-upload type.
    file = fields.Raw(
        required=True,
        metadata={
            "description": "The file to upload",
            "type": "string",
            "format": "binary",
        },
    )
    header_row = fields.Integer(
        metadata={
            "description": "Row containing the headers to use as column names"
            "(0 is first line of data). Leave empty if there is no header row."
        }
    )
class ColumnarMetadataUploadFilePostSchema(BaseUploadFilePostSchema):
    """
    Schema for Columnar file metadata.
    """

    # Allowed extensions come from the COLUMNAR_EXTENSIONS config entry,
    # intersected with ALLOWED_EXTENSIONS by the base-class validator.
    _extension_config_key = "COLUMNAR_EXTENSIONS"

    # fields.Raw is used because marshmallow has no native file-upload type.
    file = fields.Raw(
        required=True,
        metadata={
            "description": "The file to upload",
            "type": "string",
            "format": "binary",
        },
    )
class UploadFileMetadataItemSchema(Schema):
    """Metadata for one logical sheet/partition of an uploaded file."""

    # Sheet name (meaningful for Excel; other formats have a single item).
    sheet_name = fields.String(metadata={"description": "The name of the sheet"})
    column_names = fields.List(
        fields.String(),
        metadata={"description": "A list of columns names in the sheet"},
    )
class UploadFileMetadata(Schema):
    """
    Schema for upload file metadata response.
    """

    # One entry per sheet/partition discovered in the uploaded file.
    items = fields.List(fields.Nested(UploadFileMetadataItemSchema))
class OAuth2ProviderResponseSchema(Schema):

View File

@ -170,7 +170,7 @@ class SupersetAppInitializer: # pylint: disable=too-many-public-methods
DashboardModelView,
DashboardModelViewAsync,
)
from superset.views.database.views import ColumnarToDatabaseView, DatabaseView
from superset.views.database.views import DatabaseView
from superset.views.datasource.views import DatasetEditor, Datasource
from superset.views.dynamic_plugins import DynamicPluginsView
from superset.views.explore import ExplorePermalinkView, ExploreView
@ -291,7 +291,6 @@ class SupersetAppInitializer: # pylint: disable=too-many-public-methods
#
appbuilder.add_view_no_menu(Api)
appbuilder.add_view_no_menu(CssTemplateAsyncModelView)
appbuilder.add_view_no_menu(ColumnarToDatabaseView)
appbuilder.add_view_no_menu(Dashboard)
appbuilder.add_view_no_menu(DashboardModelViewAsync)
appbuilder.add_view_no_menu(Datasource)

View File

@ -0,0 +1,88 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""mig new columnar upload perm
Revision ID: 4a33124c18ad
Revises: 5f57af97bc3f
Create Date: 2024-04-26 12:36:07.800489
"""
# revision identifiers, used by Alembic.
revision = "4a33124c18ad"
down_revision = "5f57af97bc3f"
from alembic import op # noqa: E402
from sqlalchemy.exc import SQLAlchemyError # noqa: E402
from sqlalchemy.orm import Session # noqa: E402
from superset.migrations.shared.security_converge import ( # noqa: E402
add_pvms,
get_reversed_new_pvms,
get_reversed_pvm_map,
migrate_roles,
Pvm,
)
# New permission/view pairs introduced by this migration.
NEW_PVMS = {"Database": ("can_columnar_upload",)}
# Map the legacy ColumnarToDatabaseView form permissions onto the new
# API-based Database.can_columnar_upload permission.
PVM_MAP = {
    Pvm("ColumnarToDatabaseView", "can_this_form_post"): (
        Pvm("Database", "can_columnar_upload"),
    ),
    Pvm("ColumnarToDatabaseView", "can_this_form_get"): (
        Pvm("Database", "can_columnar_upload"),
    ),
}
def do_upgrade(session: Session) -> None:
    """Create the new PVMs and migrate existing roles to them."""
    add_pvms(session, NEW_PVMS)
    migrate_roles(session, PVM_MAP)
def do_downgrade(session: Session) -> None:
    """Restore the legacy PVMs and migrate roles back to them."""
    add_pvms(session, get_reversed_new_pvms(PVM_MAP))
    migrate_roles(session, get_reversed_pvm_map(PVM_MAP))
def upgrade():
    """Apply the permission migration, rolling back the session on failure."""
    bind = op.get_bind()
    session = Session(bind=bind)
    do_upgrade(session)

    try:
        session.commit()
    except SQLAlchemyError as ex:
        session.rollback()
        # Chain the original error so the underlying DB failure is preserved
        # in the traceback instead of being flattened into a string.
        raise Exception(f"An error occurred while upgrading permissions: {ex}") from ex
def downgrade():
    """Revert the permission migration.

    Best-effort: a failure is reported and rolled back rather than raised,
    so the downgrade itself does not abort.
    """
    bind = op.get_bind()
    session = Session(bind=bind)
    do_downgrade(session)

    try:
        session.commit()
    except SQLAlchemyError as ex:
        print(f"An error occurred while downgrading permissions: {ex}")
        session.rollback()

View File

@ -1,174 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Contains the logic to create cohesive forms on the explore view"""
from flask_appbuilder.fields import QuerySelectField
from flask_appbuilder.fieldwidgets import BS3TextFieldWidget
from flask_appbuilder.forms import DynamicForm
from flask_babel import lazy_gettext as _
from flask_wtf.file import FileAllowed
from wtforms import BooleanField, MultipleFileField, SelectField, StringField
from wtforms.validators import DataRequired, Optional, Regexp
from superset import app, db, security_manager
from superset.forms import JsonListField
from superset.models.core import Database
config = app.config
class UploadToDatabaseForm(DynamicForm):
    """Base form providing the list of databases eligible for file uploads."""

    @staticmethod
    def file_allowed_dbs() -> list[Database]:
        """Return databases with file upload enabled that pass schema and engine checks."""
        file_enabled_dbs = (
            db.session.query(Database).filter_by(allow_file_upload=True).all()
        )
        return [
            file_enabled_db
            for file_enabled_db in file_enabled_dbs
            if UploadToDatabaseForm.at_least_one_schema_is_allowed(file_enabled_db)
            and UploadToDatabaseForm.is_engine_allowed_to_file_upl(file_enabled_db)
        ]

    @staticmethod
    def at_least_one_schema_is_allowed(database: Database) -> bool:
        """
        If the user has access to the database or all datasource
        1. if schemas_allowed_for_file_upload is empty
            a) if database does not support schema
                user is able to upload csv without specifying schema name
            b) if database supports schema
                user is able to upload csv to any schema
        2. if schemas_allowed_for_file_upload is not empty
            a) if database does not support schema
                This situation is impossible and upload will fail
            b) if database supports schema
                user is able to upload to schema in schemas_allowed_for_file_upload
        elif the user does not access to the database or all datasource
        1. if schemas_allowed_for_file_upload is empty
            a) if database does not support schema
                user is unable to upload csv
            b) if database supports schema
                user is unable to upload csv
        2. if schemas_allowed_for_file_upload is not empty
            a) if database does not support schema
                This situation is impossible and user is unable to upload csv
            b) if database supports schema
                user is able to upload to schema in schemas_allowed_for_file_upload
        """
        # Full database access short-circuits the per-schema check.
        if security_manager.can_access_database(database):
            return True
        schemas = database.get_schema_access_for_file_upload()
        if schemas and security_manager.get_schemas_accessible_by_user(
            database, schemas, False
        ):
            return True
        return False

    @staticmethod
    def is_engine_allowed_to_file_upl(database: Database) -> bool:
        """
        This method is mainly used for existing Gsheets and Clickhouse DBs
        that have allow_file_upload set as True but they are no longer valid
        DBs for file uploading.
        New GSheets and Clickhouse DBs won't have the option to set
        allow_file_upload set as True.
        """
        if database.db_engine_spec.supports_file_upload:
            return True
        return False
class ColumnarToDatabaseForm(UploadToDatabaseForm):
    """Legacy WTForms form for uploading columnar (e.g. parquet) files."""

    # Bare table name; a schema qualifier is rejected by the regexp and must
    # be supplied via the separate `schema` field instead.
    name = StringField(
        _("Table Name"),
        description=_("Name of table to be created from columnar data."),
        validators=[
            DataRequired(),
            Regexp(r"^[^\.]+$", message=_("Table name cannot contain a schema")),
        ],
        widget=BS3TextFieldWidget(),
    )
    # Multiple files are accepted; extensions are restricted to the
    # intersection of ALLOWED_EXTENSIONS and COLUMNAR_EXTENSIONS.
    columnar_file = MultipleFileField(
        _("Columnar File"),
        description=_("Select a Columnar file to be uploaded to a database."),
        validators=[
            DataRequired(),
            FileAllowed(
                config["ALLOWED_EXTENSIONS"].intersection(
                    config["COLUMNAR_EXTENSIONS"]
                ),
                _(
                    "Only the following file extensions are allowed: "
                    "%(allowed_extensions)s",
                    allowed_extensions=", ".join(
                        config["ALLOWED_EXTENSIONS"].intersection(
                            config["COLUMNAR_EXTENSIONS"]
                        )
                    ),
                ),
            ),
        ],
    )
    # Only databases that pass the base class's upload eligibility checks.
    database = QuerySelectField(
        _("Database"),
        query_func=UploadToDatabaseForm.file_allowed_dbs,
        get_pk_func=lambda a: a.id,
        get_label=lambda a: a.database_name,
    )
    schema = StringField(
        _("Schema"),
        description=_("Specify a schema (if database flavor supports this)."),
        validators=[Optional()],
        widget=BS3TextFieldWidget(),
    )
    if_exists = SelectField(
        _("Table Exists"),
        description=_(
            "If table exists do one of the following: "
            "Fail (do nothing), Replace (drop and recreate table) "
            "or Append (insert data)."
        ),
        choices=[
            ("fail", _("Fail")),
            ("replace", _("Replace")),
            ("append", _("Append")),
        ],
        validators=[DataRequired()],
    )
    # JSON-encoded list of column names to read; None reads all columns.
    usecols = JsonListField(
        _("Use Columns"),
        default=None,
        description=_(
            "Json list of the column names that should be read. "
            "If not None, only these columns will be read from the file."
        ),
        validators=[Optional()],
    )
    index = BooleanField(
        _("Dataframe Index"), description=_("Write dataframe index as a column.")
    )
    index_label = StringField(
        _("Column Label(s)"),
        description=_(
            "Column label for index column(s). If None is given "
            "and Dataframe Index is True, Index Names are used."
        ),
        validators=[Optional()],
        widget=BS3TextFieldWidget(),
    )

View File

@ -14,34 +14,26 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import io
import zipfile
from typing import Any, TYPE_CHECKING
import pandas as pd
from flask import flash, g, redirect
from flask import redirect
from flask_appbuilder import expose, SimpleFormView
from flask_appbuilder.models.sqla.interface import SQLAInterface
from flask_appbuilder.security.decorators import has_access
from flask_babel import lazy_gettext as _
from werkzeug.wrappers import Response
from wtforms.fields import StringField
from wtforms.validators import ValidationError
import superset.models.core as models
from superset import app, db
from superset.connectors.sqla.models import SqlaTable
from superset import app
from superset.constants import MODEL_VIEW_RW_METHOD_PERMISSION_MAP, RouteMethod
from superset.exceptions import CertificateException
from superset.extensions import event_logger
from superset.sql_parse import Table
from superset.superset_typing import FlaskResponse
from superset.utils import core as utils
from superset.views.base import DeleteMixin, SupersetModelView, YamlExportMixin
from .forms import ColumnarToDatabaseForm
from .mixins import DatabaseMixin
from .validators import schema_allows_file_upload, sqlalchemy_uri_validator
from .validators import sqlalchemy_uri_validator
if TYPE_CHECKING:
from werkzeug.datastructures import FileStorage
@ -148,145 +140,3 @@ class CustomFormView(SimpleFormView):
form=form,
appbuilder=self.appbuilder,
)
class ColumnarToDatabaseView(SimpleFormView):
    """Legacy form view that loads columnar (parquet/zip) files into a table."""

    form = ColumnarToDatabaseForm
    form_template = "superset/form_view/columnar_to_database_view/edit.html"
    form_title = _("Columnar to Database configuration")
    add_columns = ["database", "schema", "table_name"]

    def form_get(self, form: ColumnarToDatabaseForm) -> None:
        # Default to the safe choice: never clobber an existing table.
        form.if_exists.data = "fail"

    def form_post(  # pylint: disable=too-many-locals
        self, form: ColumnarToDatabaseForm
    ) -> Response:
        """Validate the upload, write it via df_to_sql, and register the dataset."""
        database = form.database.data
        columnar_table = Table(table=form.name.data, schema=form.schema.data)
        files = form.columnar_file.data
        # Set of distinct file extensions across all uploaded files.
        file_type = {file.filename.split(".")[-1] for file in files}

        if file_type == {"zip"}:
            # A single zip archive: replace `files` with in-memory buffers of
            # its members and re-derive the extension set from member names.
            zipfile_ob = zipfile.ZipFile(  # pylint: disable=consider-using-with
                form.columnar_file.data[0]
            )
            file_type = {filename.split(".")[-1] for filename in zipfile_ob.namelist()}
            files = [
                # pylint: disable=consider-using-with
                io.BytesIO((zipfile_ob.open(filename).read(), filename)[0])
                for filename in zipfile_ob.namelist()
            ]

        if len(file_type) > 1:
            # Mixed extensions (directly or inside the zip) are rejected.
            message = _(
                "Multiple file extensions are not allowed for columnar uploads."
                " Please make sure all files are of the same extension.",
            )
            flash(message, "danger")
            return redirect("/columnartodatabaseview/form")

        read = pd.read_parquet
        kwargs = {
            "columns": form.usecols.data if form.usecols.data else None,
        }

        if not schema_allows_file_upload(database, columnar_table.schema):
            message = _(
                'Database "%(database_name)s" schema "%(schema_name)s" '
                "is not allowed for columnar uploads. "
                "Please contact your Superset Admin.",
                database_name=database.database_name,
                schema_name=columnar_table.schema,
            )
            flash(message, "danger")
            return redirect("/columnartodatabaseview/form")

        try:
            # Read every file and concatenate into a single dataframe.
            chunks = [read(file, **kwargs) for file in files]
            df = pd.concat(chunks)

            database = (
                db.session.query(models.Database)
                .filter_by(id=form.data.get("database").data.get("id"))
                .one()
            )

            database.db_engine_spec.df_to_sql(
                database,
                columnar_table,
                df,
                to_sql_kwargs={
                    "chunksize": 1000,
                    "if_exists": form.if_exists.data,
                    "index": form.index.data,
                    "index_label": form.index_label.data,
                },
            )

            # Connect table to the database that should be used for exploration.
            # E.g. if hive was used to upload a csv, presto will be a better option
            # to explore the table.
            explore_database = database
            explore_database_id = database.explore_database_id
            if explore_database_id:
                explore_database = (
                    db.session.query(models.Database)
                    .filter_by(id=explore_database_id)
                    .one_or_none()
                    or database
                )

            sqla_table = (
                db.session.query(SqlaTable)
                .filter_by(
                    table_name=columnar_table.table,
                    schema=columnar_table.schema,
                    database_id=explore_database.id,
                )
                .one_or_none()
            )

            if sqla_table:
                sqla_table.fetch_metadata()
            if not sqla_table:
                # First upload to this table: create the dataset record.
                sqla_table = SqlaTable(table_name=columnar_table.table)
                sqla_table.database = explore_database
                sqla_table.database_id = database.id
                sqla_table.owners = [g.user]
                sqla_table.schema = columnar_table.schema
                sqla_table.fetch_metadata()
                db.session.add(sqla_table)
            db.session.commit()
        except Exception as ex:  # pylint: disable=broad-except
            db.session.rollback()
            message = _(
                'Unable to upload Columnar file "%(filename)s" to table '
                '"%(table_name)s" in database "%(db_name)s". '
                "Error message: %(error_msg)s",
                filename=[file.filename for file in form.columnar_file.data],
                table_name=form.name.data,
                db_name=database.database_name,
                error_msg=str(ex),
            )

            flash(message, "danger")
            # NOTE(review): `stats_logger` is not among this module's visible
            # imports — confirm it is in scope before relying on this line.
            stats_logger.incr("failed_columnar_upload")
            return redirect("/columnartodatabaseview/form")

        # Go back to welcome page / splash screen
        message = _(
            'Columnar file "%(columnar_filename)s" uploaded to table "%(table_name)s" '
            'in database "%(db_name)s"',
            columnar_filename=[file.filename for file in form.columnar_file.data],
            table_name=str(columnar_table),
            db_name=sqla_table.database.database_name,
        )
        flash(message, "info")
        event_logger.log_with_context(
            action="successful_columnar_upload",
            database=form.database.data.name,
            schema=form.schema.data,
            table=form.name.data,
        )
        return redirect("/tablemodelview/list/")

View File

@ -1,237 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# isort:skip_file
"""Unit tests for Superset CSV upload"""
import json
import logging
import os
import shutil
from typing import Optional
from unittest import mock
import pandas as pd
import pytest
import superset.utils.database
from superset.sql_parse import Table
from tests.integration_tests.conftest import ADMIN_SCHEMA_NAME # noqa: F401
from superset import db
from superset import security_manager
from superset.models.core import Database
from superset.utils import core as utils
from tests.integration_tests.test_app import app, login
from tests.integration_tests.base_tests import get_resp, SupersetTestCase
logger = logging.getLogger(__name__)
test_client = app.test_client()

# Name of the throwaway database created for these upload tests.
CSV_UPLOAD_DATABASE = "csv_explore_db"
# Fixture file names/paths created on disk by the fixtures below.
EXCEL_FILENAME = "testExcel.xlsx"
PARQUET_FILENAME1 = "testZip/testParquet1.parquet"
PARQUET_FILENAME2 = "testZip/testParquet2.parquet"
ZIP_DIRNAME = "testZip"
ZIP_FILENAME = "testZip.zip"

# Target table names; all are dropped again during fixture teardown.
EXCEL_UPLOAD_TABLE = "excel_upload"
CSV_UPLOAD_TABLE = "csv_upload"
PARQUET_UPLOAD_TABLE = "parquet_upload"
CSV_UPLOAD_TABLE_W_SCHEMA = "csv_upload_w_schema"
CSV_UPLOAD_TABLE_W_EXPLORE = "csv_upload_w_explore"
def _setup_csv_upload():
    """Generator fixture body: create the upload DB, yield, then drop tables and DB."""
    upload_db = superset.utils.database.get_or_create_db(
        CSV_UPLOAD_DATABASE, app.config["SQLALCHEMY_EXAMPLES_URI"]
    )
    extra = upload_db.get_extra()
    # Point exploration at the example database (see explore_database_id use).
    extra["explore_database_id"] = superset.utils.database.get_example_database().id
    upload_db.extra = json.dumps(extra)
    upload_db.allow_file_upload = True
    db.session.commit()

    yield

    # Teardown: drop every table the tests may have created, then the DB record.
    upload_db = get_upload_db()
    with upload_db.get_sqla_engine() as engine:
        engine.execute(f"DROP TABLE IF EXISTS {EXCEL_UPLOAD_TABLE}")
        engine.execute(f"DROP TABLE IF EXISTS {CSV_UPLOAD_TABLE}")
        engine.execute(f"DROP TABLE IF EXISTS {PARQUET_UPLOAD_TABLE}")
        engine.execute(f"DROP TABLE IF EXISTS {CSV_UPLOAD_TABLE_W_SCHEMA}")
        engine.execute(f"DROP TABLE IF EXISTS {CSV_UPLOAD_TABLE_W_EXPLORE}")
    db.session.delete(upload_db)
    db.session.commit()
@pytest.fixture(scope="module")
def setup_csv_upload(login_as_admin):
    # Runs the shared setup/teardown with an already-authenticated admin.
    yield from _setup_csv_upload()
@pytest.fixture(scope="module")
def setup_csv_upload_with_context():
    # Variant that pushes an app context and logs in via the test client.
    with app.app_context():
        login(test_client, username="admin")
        yield from _setup_csv_upload()
@pytest.fixture()
def create_columnar_files():
    """Create two parquet files plus a zip archive of them; clean up afterwards."""
    os.mkdir(ZIP_DIRNAME)
    pd.DataFrame({"a": ["john", "paul"], "b": [1, 2]}).to_parquet(PARQUET_FILENAME1)
    pd.DataFrame({"a": ["max", "bob"], "b": [3, 4]}).to_parquet(PARQUET_FILENAME2)
    shutil.make_archive(ZIP_DIRNAME, "zip", ZIP_DIRNAME)
    yield
    os.remove(ZIP_FILENAME)
    shutil.rmtree(ZIP_DIRNAME)
def get_upload_db():
    """Return the Database record created for these upload tests."""
    return db.session.query(Database).filter_by(database_name=CSV_UPLOAD_DATABASE).one()
def upload_columnar(
    filename: str, table_name: str, extra: Optional[dict[str, str]] = None
):
    """POST *filename* to the legacy columnar-upload form and return the response text.

    *extra* entries override/extend the default form fields (e.g. "if_exists").
    """
    columnar_upload_db_id = get_upload_db().id
    form_data = {
        # NOTE(review): this handle is never explicitly closed; it lives for
        # the duration of the test process.
        "columnar_file": open(filename, "rb"),
        "name": table_name,
        "database": columnar_upload_db_id,
        "if_exists": "fail",
        "index_label": "test_label",
    }
    if schema := utils.get_example_default_schema():
        form_data["schema"] = schema
    if extra:
        form_data.update(extra)
    return get_resp(test_client, "/columnartodatabaseview/form", data=form_data)
def mock_upload_to_s3(filename: str, upload_prefix: str, table: Table) -> str:
    """
    HDFS is used instead of S3 for the unit tests.integration_tests.

    :param filename: The file to upload
    :param upload_prefix: The S3 prefix
    :param table: The table that will be created
    :returns: The HDFS path to the directory with external table files
    """
    # only needed for the hive tests
    import docker

    client = docker.from_env()  # type: ignore
    container = client.containers.get("namenode")
    # docker mounted volume that contains csv uploads
    src = os.path.join("/tmp/superset_uploads", os.path.basename(filename))
    # hdfs destination for the external tables
    dest_dir = os.path.join("/tmp/external/superset_uploads/", str(table))
    container.exec_run(f"hdfs dfs -mkdir -p {dest_dir}")
    dest = os.path.join(dest_dir, os.path.basename(filename))
    container.exec_run(f"hdfs dfs -put {src} {dest}")
    # hive external table expects a directory for the location
    return dest_dir
def escaped_double_quotes(text):
    """Wrap *text* in backslash-escaped HTML double-quote entities."""
    quote = r"\&#34;"
    return f"{quote}{text}{quote}"


def escaped_parquet(text):
    """Render *text* as an HTML-escaped single-element Python list literal."""
    listed = f"[&#39;{text}&#39;]"
    return escaped_double_quotes(listed)
@pytest.mark.usefixtures("setup_csv_upload_with_context")
@pytest.mark.usefixtures("create_columnar_files")
@mock.patch("superset.db_engine_specs.hive.upload_to_s3", mock_upload_to_s3)
@mock.patch("superset.views.database.views.event_logger.log_with_context")
def test_import_parquet(mock_event_logger):
    """End-to-end test of the legacy columnar form: fail/append/replace modes,
    column selection, owner assignment, and zip-archive uploads."""
    if utils.backend() == "hive":
        pytest.skip("Hive doesn't allow parquet upload.")

    schema = utils.get_example_default_schema()
    full_table_name = (
        f"{schema}.{PARQUET_UPLOAD_TABLE}" if schema else PARQUET_UPLOAD_TABLE
    )
    test_db = get_upload_db()

    success_msg_f1 = f"Columnar file {escaped_parquet(PARQUET_FILENAME1)} uploaded to table {escaped_double_quotes(full_table_name)}"

    # initial upload with fail mode
    resp = upload_columnar(PARQUET_FILENAME1, PARQUET_UPLOAD_TABLE)
    assert success_msg_f1 in resp

    # upload again with fail mode; should fail
    fail_msg = f"Unable to upload Columnar file {escaped_parquet(PARQUET_FILENAME1)} to table {escaped_double_quotes(PARQUET_UPLOAD_TABLE)}"
    resp = upload_columnar(PARQUET_FILENAME1, PARQUET_UPLOAD_TABLE)
    assert fail_msg in resp

    if utils.backend() != "hive":
        # upload again with append mode
        resp = upload_columnar(
            PARQUET_FILENAME1, PARQUET_UPLOAD_TABLE, extra={"if_exists": "append"}
        )
        assert success_msg_f1 in resp
        mock_event_logger.assert_called_with(
            action="successful_columnar_upload",
            database=test_db.name,
            schema=schema,
            table=PARQUET_UPLOAD_TABLE,
        )

    # upload again with replace mode and specific columns
    resp = upload_columnar(
        PARQUET_FILENAME1,
        PARQUET_UPLOAD_TABLE,
        extra={"if_exists": "replace", "usecols": '["a"]'},
    )
    assert success_msg_f1 in resp

    table = SupersetTestCase.get_table(name=PARQUET_UPLOAD_TABLE, schema=None)
    # make sure only specified column name was read
    assert "b" not in table.column_names

    # ensure user is assigned as an owner
    assert security_manager.find_user("admin") in table.owners

    # upload again with replace mode
    resp = upload_columnar(
        PARQUET_FILENAME1, PARQUET_UPLOAD_TABLE, extra={"if_exists": "replace"}
    )
    assert success_msg_f1 in resp

    with test_db.get_sqla_engine() as engine:
        data = engine.execute(
            f"SELECT * from {PARQUET_UPLOAD_TABLE} ORDER BY b"
        ).fetchall()
        assert data == [("john", 1), ("paul", 2)]

    # replace table with zip file
    resp = upload_columnar(
        ZIP_FILENAME, PARQUET_UPLOAD_TABLE, extra={"if_exists": "replace"}
    )
    success_msg_f2 = f"Columnar file {escaped_parquet(ZIP_FILENAME)} uploaded to table {escaped_double_quotes(full_table_name)}"
    assert success_msg_f2 in resp

    # The zip contains both parquet files, so the replaced table holds all rows.
    with test_db.get_sqla_engine() as engine:
        data = engine.execute(
            f"SELECT * from {PARQUET_UPLOAD_TABLE} ORDER BY b"
        ).fetchall()
        assert data == [("john", 1), ("paul", 2), ("max", 3), ("bob", 4)]

View File

@ -1448,6 +1448,7 @@ class TestDatabaseApi(SupersetTestCase):
assert rv.status_code == 200
assert set(data["permissions"]) == {
"can_read",
"can_columnar_upload",
"can_csv_upload",
"can_excel_upload",
"can_write",

View File

@ -138,6 +138,38 @@ def test_csv_upload_dataset():
assert security_manager.find_user("admin") in dataset.owners
@pytest.mark.usefixtures("setup_csv_upload_with_context")
def test_csv_upload_with_index():
    """Uploading with dataframe_index=True writes the index as an extra 'id' column."""
    admin_user = security_manager.find_user(username="admin")
    upload_database = get_upload_db()

    with override_user(admin_user):
        UploadCommand(
            upload_database.id,
            CSV_UPLOAD_TABLE,
            create_csv_file(CSV_FILE_1),
            None,
            CSVReader({"dataframe_index": True, "index_label": "id"}),
        ).run()

    with upload_database.get_sqla_engine() as engine:
        data = engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE}").fetchall()
        # First value of each row is the written dataframe index.
        assert data == [
            (0, "name1", 30, "city1", "1-1-1980"),
            (1, "name2", 29, "city2", "1-1-1981"),
            (2, "name3", 28, "city3", "1-1-1982"),
        ]
        # assert column names
        assert [
            col for col in engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE}").keys()
        ] == [
            "id",
            "Name",
            "Age",
            "City",
            "Birth",
        ]
@only_postgresql
@pytest.mark.usefixtures("setup_csv_upload_with_context")
def test_csv_upload_database_not_found():

View File

@ -0,0 +1,253 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import io
import tempfile
from typing import Any
from zipfile import ZipFile
import numpy as np
import pytest
from werkzeug.datastructures import FileStorage
from superset.commands.database.exceptions import DatabaseUploadFailed
from superset.commands.database.uploaders.columnar_reader import (
ColumnarReader,
ColumnarReaderOptions,
)
from tests.unit_tests.fixtures.common import create_columnar_file
# Baseline columnar dataset shared by the reader tests below.
COLUMNAR_DATA: dict[str, list[Any]] = {
    "Name": ["name1", "name2", "name3"],
    "Age": [30, 25, 20],
    "City": ["city1", "city2", "city3"],
    "Birth": ["1990-02-01", "1995-02-01", "2000-02-01"],
}
# Variant with NULLs in a numeric column ("Age") and a string column ("City").
COLUMNAR_WITH_NULLS: dict[str, list[Any]] = {
    "Name": ["name1", "name2", "name3"],
    "Age": [None, 25, 20],
    "City": ["city1", None, "city3"],
    "Birth": ["1990-02-01", "1995-02-01", "2000-02-01"],
}
# Variant with float ages to exercise non-integer numeric columns.
COLUMNAR_WITH_FLOATS: dict[str, list[Any]] = {
    "Name": ["name1", "name2", "name3"],
    "Age": [30.1, 25.1, 20.1],
    "City": ["city1", "city2", "city3"],
    "Birth": ["1990-02-01", "1995-02-01", "2000-02-01"],
}
@pytest.mark.parametrize(
    "file, options, expected_cols, expected_values",
    [
        (
            create_columnar_file(COLUMNAR_DATA),
            ColumnarReaderOptions(),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_columnar_file(COLUMNAR_DATA),
            ColumnarReaderOptions(
                columns_read=["Name", "Age"],
            ),
            ["Name", "Age"],
            [
                ["name1", 30],
                ["name2", 25],
                ["name3", 20],
            ],
        ),
        (
            create_columnar_file(COLUMNAR_DATA),
            ColumnarReaderOptions(
                columns_read=[],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_columnar_file(COLUMNAR_WITH_NULLS),
            ColumnarReaderOptions(),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", np.nan, "city1", "1990-02-01"],
                ["name2", 25, None, "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_columnar_file(COLUMNAR_WITH_FLOATS),
            ColumnarReaderOptions(),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30.1, "city1", "1990-02-01"],
                ["name2", 25.1, "city2", "1995-02-01"],
                ["name3", 20.1, "city3", "2000-02-01"],
            ],
        ),
    ],
)
def test_columnar_reader_file_to_dataframe(
    file, options, expected_cols, expected_values
):
    """
    ColumnarReader.file_to_dataframe honors column selection and preserves
    cell values (including NULLs and floats) from the parquet file.
    """
    reader = ColumnarReader(
        options=options,
    )
    df = reader.file_to_dataframe(file)
    assert df.columns.tolist() == expected_cols
    actual_values = df.values.tolist()
    assert len(actual_values) == len(expected_values)
    for expected_row, actual_row in zip(expected_values, actual_values):
        assert len(actual_row) == len(expected_row)
        for expected_val, actual_val in zip(expected_row, actual_row):
            # NaN != NaN, so NaN cells need an explicit isnan check. The
            # original code compared only NaN-ness whenever BOTH values were
            # floats, which skipped the equality check for real float values
            # (e.g. the COLUMNAR_WITH_FLOATS case was never truly verified).
            if isinstance(expected_val, float) and np.isnan(expected_val):
                assert isinstance(actual_val, float) and np.isnan(actual_val)
            else:
                assert expected_val == actual_val
    file.close()
def test_excel_reader_wrong_columns_to_read():
    """
    Requesting a column absent from the parquet file surfaces the pyarrow
    error as DatabaseUploadFailed with the full schema in the message.
    """
    reader = ColumnarReader(
        options=ColumnarReaderOptions(columns_read=["xpto"]),
    )
    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_to_dataframe(create_columnar_file(COLUMNAR_DATA))
    # The original assertion chained `== ... != ...`; the trailing `!=`
    # clause was always true and only obscured the check, so it is dropped.
    assert str(ex.value) == (
        "Parsing error: No match for FieldRef.Name(xpto) in Name: string\n"
        "Age: int64\n"
        "City: string\n"
        "Birth: string\n"
        "__fragment_index: int32\n"
        "__batch_index: int32\n"
        "__last_in_fragment: bool\n"
        "__filename: string"
    )
def test_columnar_reader_invalid_file():
    """A file too small to hold a parquet footer raises DatabaseUploadFailed."""
    bogus = FileStorage(io.BytesIO(b"c1"), "test.parquet")
    reader = ColumnarReader(options=ColumnarReaderOptions())
    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_to_dataframe(bogus)
    expected_message = (
        "Parsing error: Could not open Parquet input source '<Buffer>': Parquet file "
        "size is 2 bytes, smaller than the minimum file footer (8 bytes)"
    )
    assert str(ex.value) == expected_message
def test_columnar_reader_zip():
    """
    A zip archive containing multiple parquet files is read as the
    concatenation of all member files.
    """
    import os

    reader = ColumnarReader(
        options=ColumnarReaderOptions(),
    )
    file1 = create_columnar_file(COLUMNAR_DATA, "test1.parquet")
    file2 = create_columnar_file(COLUMNAR_DATA, "test2.parquet")
    tmp_paths = []
    try:
        with tempfile.NamedTemporaryFile(delete=False) as tmp_file1:
            tmp_paths.append(tmp_file1.name)
            tmp_file1.write(file1.read())
            tmp_file1.seek(0)
        with tempfile.NamedTemporaryFile(delete=False) as tmp_file2:
            tmp_paths.append(tmp_file2.name)
            tmp_file2.write(file2.read())
            tmp_file2.seek(0)
        with tempfile.NamedTemporaryFile(delete=False) as tmp_zip:
            tmp_paths.append(tmp_zip.name)
            with ZipFile(tmp_zip, "w") as zip_file:
                zip_file.write(tmp_file1.name, "test1.parquet")
                zip_file.write(tmp_file2.name, "test2.parquet")
            tmp_zip.seek(0)  # Reset file pointer to beginning
            df = reader.file_to_dataframe(FileStorage(tmp_zip, "test.zip"))
        assert df.columns.tolist() == ["Name", "Age", "City", "Birth"]
        assert df.values.tolist() == [
            ["name1", 30, "city1", "1990-02-01"],
            ["name2", 25, "city2", "1995-02-01"],
            ["name3", 20, "city3", "2000-02-01"],
            ["name1", 30, "city1", "1990-02-01"],
            ["name2", 25, "city2", "1995-02-01"],
            ["name3", 20, "city3", "2000-02-01"],
        ]
    finally:
        # The original test leaked every delete=False temp file; remove them.
        for path in tmp_paths:
            os.unlink(path)
def test_columnar_reader_bad_parquet_in_zip():
    """A zip whose members are not valid parquet raises DatabaseUploadFailed."""
    import os

    reader = ColumnarReader(
        options=ColumnarReaderOptions(),
    )
    with tempfile.NamedTemporaryFile(delete=False) as tmp_zip:
        try:
            with ZipFile(tmp_zip, "w") as zip_file:
                zip_file.writestr("test1.parquet", b"bad parquet file")
                zip_file.writestr("test2.parquet", b"bad parquet file")
            tmp_zip.seek(0)  # Reset file pointer to beginning
            with pytest.raises(DatabaseUploadFailed) as ex:
                reader.file_to_dataframe(FileStorage(tmp_zip, "test.zip"))
        finally:
            # The original test leaked the delete=False temp file.
            os.unlink(tmp_zip.name)
    assert str(ex.value) == (
        "Parsing error: Could not open Parquet input source '<Buffer>': "
        "Parquet magic bytes not found in footer. "
        "Either the file is corrupted or this is not a parquet file."
    )
def test_columnar_reader_bad_zip():
    """Non-zip bytes with a .zip filename raise DatabaseUploadFailed."""
    broken_zip = FileStorage(io.BytesIO(b"bad zip file"), "test.zip")
    reader = ColumnarReader(options=ColumnarReaderOptions())
    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_to_dataframe(broken_zip)
    assert str(ex.value) == "Not a valid ZIP file"
def test_columnar_reader_metadata():
    """file_metadata lists the parquet column names; parquet has no sheets."""
    uploaded = create_columnar_file(COLUMNAR_DATA)
    reader = ColumnarReader(options=ColumnarReaderOptions())
    metadata = reader.file_metadata(uploaded)
    first_item = metadata["items"][0]
    assert sorted(first_item["column_names"]) == ["Age", "Birth", "City", "Name"]
    assert first_item["sheet_name"] is None
def test_columnar_reader_metadata_invalid_file():
    """file_metadata on a truncated parquet raises DatabaseUploadFailed."""
    bogus = FileStorage(io.BytesIO(b"c1"), "test.parquet")
    reader = ColumnarReader(options=ColumnarReaderOptions())
    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_metadata(bogus)
    expected_message = (
        "Parsing error: Parquet file size is 2 bytes, "
        "smaller than the minimum file footer (8 bytes)"
    )
    assert str(ex.value) == expected_message

View File

@ -19,6 +19,7 @@ from datetime import datetime
import numpy as np
import pytest
from werkzeug.datastructures import FileStorage
from superset.commands.database.exceptions import DatabaseUploadFailed
from superset.commands.database.uploaders.csv_reader import CSVReader, CSVReaderOptions
@ -265,6 +266,23 @@ def test_csv_reader_file_to_dataframe(file, options, expected_cols, expected_val
file.close()
def test_csv_reader_index_column():
    """index_column promotes the named CSV column to the dataframe index."""
    reader = CSVReader(options=CSVReaderOptions(index_column="Name"))
    frame = reader.file_to_dataframe(create_csv_file(CSV_DATA))
    assert frame.index.name == "Name"
def test_csv_reader_wrong_index_column():
    """An index_column missing from the CSV raises DatabaseUploadFailed."""
    reader = CSVReader(options=CSVReaderOptions(index_column="wrong"))
    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_to_dataframe(create_csv_file(CSV_DATA))
    assert str(ex.value) == "Parsing error: Index wrong invalid"
def test_csv_reader_broken_file_no_columns():
csv_reader = CSVReader(
options=CSVReaderOptions(),
@ -292,7 +310,9 @@ def test_csv_reader_invalid_file():
)
with pytest.raises(DatabaseUploadFailed) as ex:
csv_reader.file_to_dataframe(
io.StringIO("c1,c2,c3\na,b,c\n1,2,3,4,5,6,7\n1,2,3")
FileStorage(
io.StringIO("c1,c2,c3\na,b,c\n1,2,3,4,5,6,7\n1,2,3"), filename=""
)
)
assert str(ex.value) == (
"Parsing error: Error tokenizing data. C error:"
@ -306,8 +326,48 @@ def test_csv_reader_invalid_encoding():
)
binary_data = b"col1,col2,col3\nv1,v2,\xba\nv3,v4,v5\n"
with pytest.raises(DatabaseUploadFailed) as ex:
csv_reader.file_to_dataframe(io.BytesIO(binary_data))
csv_reader.file_to_dataframe(FileStorage(io.BytesIO(binary_data)))
assert str(ex.value) == (
"Parsing error: 'utf-8' codec can't decode byte 0xba in"
" position 21: invalid start byte"
)
def test_csv_reader_file_metadata():
    """file_metadata reports the header columns for default and custom delimiters."""
    expected_metadata = {
        "items": [
            {"column_names": ["Name", "Age", "City", "Birth"], "sheet_name": None}
        ]
    }
    # Default comma delimiter.
    comma_file = create_csv_file(CSV_DATA)
    comma_reader = CSVReader(options=CSVReaderOptions())
    assert comma_reader.file_metadata(comma_file) == expected_metadata
    comma_file.close()
    # Pipe delimiter, configured via CSVReaderOptions.
    pipe_file = create_csv_file(CSV_DATA, delimiter="|")
    pipe_reader = CSVReader(options=CSVReaderOptions(delimiter="|"))
    assert pipe_reader.file_metadata(pipe_file) == expected_metadata
    pipe_file.close()
def test_csv_reader_file_metadata_invalid_file():
    """A malformed CSV makes file_metadata raise DatabaseUploadFailed."""
    broken = FileStorage(io.StringIO("c1,c2,c3\na,b,c\n1,2,3,4,5,6,7\n1,2,3"))
    reader = CSVReader(options=CSVReaderOptions())
    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_metadata(broken)
    expected_message = (
        "Parsing error: Error tokenizing data. C error:"
        " Expected 3 fields in line 3, saw 7\n"
    )
    assert str(ex.value) == expected_message

View File

@ -20,6 +20,9 @@ from typing import Any
import numpy as np
import pytest
import xlsxwriter
from werkzeug.datastructures import FileStorage
from xlsxwriter.workbook import Worksheet
from superset.commands.database.exceptions import DatabaseUploadFailed
from superset.commands.database.uploaders.excel_reader import (
@ -50,6 +53,18 @@ EXCEL_DATA_DECIMAL_CHAR = {
}
def write_data_to_worksheet(
    worksheet: "Worksheet", header: list[str], data: list[list[Any]]
) -> None:
    """
    Write *header* followed by *data* rows into *worksheet*, starting at A1.

    The original implementation unpacked every row into exactly two values
    (``name, age``); this version supports rows of any width while remaining
    identical for two-column input. The annotation is quoted so the helper can
    be imported without xlsxwriter resolved at definition time.
    """
    for row_idx, row in enumerate([header] + data):
        for col_idx, value in enumerate(row):
            worksheet.write(row_idx, col_idx, value)
@pytest.mark.parametrize(
"file, options, expected_cols, expected_values",
[
@ -175,6 +190,23 @@ def test_excel_reader_file_to_dataframe(file, options, expected_cols, expected_v
file.close()
def test_excel_reader_index_column():
    """index_column promotes the named Excel column to the dataframe index."""
    reader = ExcelReader(options=ExcelReaderOptions(index_column="Name"))
    frame = reader.file_to_dataframe(create_excel_file(EXCEL_DATA))
    assert frame.index.name == "Name"
def test_excel_reader_wrong_index_column():
    """An index_column missing from the sheet raises DatabaseUploadFailed."""
    reader = ExcelReader(options=ExcelReaderOptions(index_column="wrong"))
    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_to_dataframe(create_excel_file(EXCEL_DATA))
    assert str(ex.value) == "Parsing error: Index wrong invalid (sheet: 0)"
def test_excel_reader_wrong_columns_to_read():
excel_reader = ExcelReader(
options=ExcelReaderOptions(columns_read=["xpto"]),
@ -203,7 +235,60 @@ def test_excel_reader_invalid_file():
options=ExcelReaderOptions(),
)
with pytest.raises(DatabaseUploadFailed) as ex:
excel_reader.file_to_dataframe(io.StringIO("c1"))
excel_reader.file_to_dataframe(FileStorage(io.BytesIO(b"c1")))
assert str(ex.value) == (
"Parsing error: Excel file format cannot be determined, you must specify an engine manually."
)
def test_excel_reader_metadata():
    """file_metadata reports columns and sheet name for a single-sheet workbook."""
    workbook_file = create_excel_file(EXCEL_DATA)
    reader = ExcelReader(options=ExcelReaderOptions())
    assert reader.file_metadata(workbook_file) == {
        "items": [
            {"column_names": ["Name", "Age", "City", "Birth"], "sheet_name": "Sheet1"}
        ]
    }
    workbook_file.close()
def test_excel_reader_metadata_mul_sheets():
    """file_metadata returns one item per worksheet, in workbook order."""
    buffer = io.BytesIO()
    workbook = xlsxwriter.Workbook(buffer)
    # Two sheets with distinct headers so ordering is observable.
    sheets = {
        "Sheet1": (["col11", "col12"], [["v11", "v12"]]),
        "Sheet2": (["col21", "col22"], [["v21", "v22"]]),
    }
    for sheet_name, (header, rows) in sheets.items():
        write_data_to_worksheet(workbook.add_worksheet(sheet_name), header, rows)
    workbook.close()
    file = FileStorage(stream=buffer, filename="test.xls")
    reader = ExcelReader(options=ExcelReaderOptions())
    assert reader.file_metadata(file) == {
        "items": [
            {"column_names": ["col11", "col12"], "sheet_name": "Sheet1"},
            {"column_names": ["col21", "col22"], "sheet_name": "Sheet2"},
        ]
    }
    file.close()
def test_excel_reader_file_metadata_invalid_file():
    """Bytes that are not a spreadsheet make file_metadata raise DatabaseUploadFailed."""
    reader = ExcelReader(options=ExcelReaderOptions())
    bogus = FileStorage(io.BytesIO(b"1"))
    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_metadata(bogus)
    assert str(ex.value) == "Excel file format cannot be determined"

View File

@ -34,13 +34,18 @@ from sqlalchemy.orm.session import Session
from superset import db
from superset.commands.database.uploaders.base import UploadCommand
from superset.commands.database.uploaders.columnar_reader import ColumnarReader
from superset.commands.database.uploaders.csv_reader import CSVReader
from superset.commands.database.uploaders.excel_reader import ExcelReader
from superset.db_engine_specs.sqlite import SqliteEngineSpec
from superset.errors import ErrorLevel, SupersetError, SupersetErrorType
from superset.exceptions import SupersetSecurityException
from superset.sql_parse import Table
from tests.unit_tests.fixtures.common import create_csv_file, create_excel_file
from tests.unit_tests.fixtures.common import (
create_columnar_file,
create_csv_file,
create_excel_file,
)
def test_filter_by_uuid(
@ -940,7 +945,7 @@ def test_csv_upload(
data=payload,
content_type="multipart/form-data",
)
assert response.status_code == 200
assert response.status_code == 201
assert response.json == {"message": "OK"}
init_mock.assert_called_with(*upload_called_with)
reader_mock.assert_called_with(*reader_called_with)
@ -1135,7 +1140,7 @@ def test_csv_upload_file_extension_invalid(
response = client.post(
"/api/v1/database/1/csv_upload/",
data={
"file": (create_csv_file(), filename),
"file": create_csv_file(filename=filename),
"table_name": "table1",
"delimiter": ",",
},
@ -1171,13 +1176,13 @@ def test_csv_upload_file_extension_valid(
response = client.post(
"/api/v1/database/1/csv_upload/",
data={
"file": (create_csv_file(), filename),
"file": create_csv_file(filename=filename),
"table_name": "table1",
"delimiter": ",",
},
content_type="multipart/form-data",
)
assert response.status_code == 200
assert response.status_code == 201
@pytest.mark.parametrize(
@ -1282,7 +1287,7 @@ def test_excel_upload(
data=payload,
content_type="multipart/form-data",
)
assert response.status_code == 200
assert response.status_code == 201
assert response.json == {"message": "OK"}
init_mock.assert_called_with(*upload_called_with)
reader_mock.assert_called_with(*reader_called_with)
@ -1406,7 +1411,7 @@ def test_excel_upload_file_extension_invalid(
response = client.post(
"/api/v1/database/1/excel_upload/",
data={
"file": (create_excel_file(), filename),
"file": create_excel_file(filename=filename),
"table_name": "table1",
},
content_type="multipart/form-data",
@ -1415,6 +1420,326 @@ def test_excel_upload_file_extension_invalid(
assert response.json == {"message": {"file": ["File extension is not allowed."]}}
@pytest.mark.parametrize(
    "payload,upload_called_with,reader_called_with",
    [
        (
            {
                "file": (create_columnar_file(), "out.parquet"),
                "table_name": "table1",
            },
            (
                1,
                "table1",
                ANY,
                None,
                ANY,
            ),
            (
                {
                    "already_exists": "fail",
                    "file": ANY,
                    "table_name": "table1",
                },
            ),
        ),
        (
            {
                "file": (create_columnar_file(), "out.parquet"),
                "table_name": "table2",
                "already_exists": "replace",
                "columns_read": "col1,col2",
                "dataframe_index": True,
                "index_label": "label",
            },
            (
                1,
                "table2",
                ANY,
                None,
                ANY,
            ),
            (
                {
                    "already_exists": "replace",
                    "columns_read": ["col1", "col2"],
                    "file": ANY,
                    "table_name": "table2",
                    "dataframe_index": True,
                    "index_label": "label",
                },
            ),
        ),
    ],
)
def test_columnar_upload(
    payload: dict[str, Any],
    upload_called_with: tuple[int, str, Any, dict[str, Any]],
    reader_called_with: dict[str, Any],
    mocker: MockFixture,
    client: Any,
    full_api_access: None,
) -> None:
    """
    Test Columnar (parquet) upload success: the endpoint returns 201 and
    forwards the parsed form options to UploadCommand and ColumnarReader.
    """
    init_mock = mocker.patch.object(UploadCommand, "__init__")
    init_mock.return_value = None
    _ = mocker.patch.object(UploadCommand, "run")
    reader_mock = mocker.patch.object(ColumnarReader, "__init__")
    reader_mock.return_value = None
    response = client.post(
        "/api/v1/database/1/columnar_upload/",
        data=payload,
        content_type="multipart/form-data",
    )
    assert response.status_code == 201
    assert response.json == {"message": "OK"}
    init_mock.assert_called_with(*upload_called_with)
    reader_mock.assert_called_with(*reader_called_with)
@pytest.mark.parametrize(
    "payload,expected_response",
    [
        (
            {
                "file": (create_columnar_file(), "out.parquet"),
                "already_exists": "fail",
            },
            {"message": {"table_name": ["Missing data for required field."]}},
        ),
        (
            {
                "file": (create_columnar_file(), "out.parquet"),
                "table_name": "",
                "already_exists": "fail",
            },
            {"message": {"table_name": ["Length must be between 1 and 10000."]}},
        ),
        (
            {"table_name": "table1", "already_exists": "fail"},
            {"message": {"file": ["Field may not be null."]}},
        ),
        (
            {
                "file": "xpto",
                "table_name": "table1",
                "already_exists": "fail",
            },
            {"message": {"file": ["Field may not be null."]}},
        ),
        (
            {
                "file": (create_columnar_file(), "out.parquet"),
                "table_name": "table1",
                "already_exists": "xpto",
            },
            {"message": {"already_exists": ["Must be one of: fail, replace, append."]}},
        ),
    ],
)
def test_columnar_upload_validation(
    payload: Any,
    expected_response: dict[str, str],
    mocker: MockFixture,
    client: Any,
    full_api_access: None,
) -> None:
    """
    Test Columnar upload request validation failures (missing or invalid fields
    are rejected with HTTP 400 and a field-level error message).
    """
    _ = mocker.patch.object(UploadCommand, "run")
    response = client.post(
        "/api/v1/database/1/columnar_upload/",
        data=payload,
        content_type="multipart/form-data",
    )
    assert response.status_code == 400
    assert response.json == expected_response
@pytest.mark.parametrize(
    "filename",
    [
        "out.parquet",
        "out.zip",
        "out.parquet.zip",
        "out something.parquet",
        "out something.zip",
    ],
)
def test_columnar_upload_file_extension_valid(
    filename: str,
    mocker: MockFixture,
    client: Any,
    full_api_access: None,
) -> None:
    """
    Test Columnar upload succeeds (201) for allowed file extensions.
    """
    _ = mocker.patch.object(UploadCommand, "run")
    response = client.post(
        "/api/v1/database/1/columnar_upload/",
        data={
            "file": (create_columnar_file(), filename),
            "table_name": "table1",
        },
        content_type="multipart/form-data",
    )
    assert response.status_code == 201
@pytest.mark.parametrize(
    "filename",
    [
        "out.xpto",
        "out.exe",
        "out",
        "out zip",
        "",
        "out.parquet.exe",
        ".parquet",
        "out.",
        ".",
        "out parquet a.exe",
    ],
)
def test_columnar_upload_file_extension_invalid(
    filename: str,
    mocker: MockFixture,
    client: Any,
    full_api_access: None,
) -> None:
    """
    Test Columnar upload is rejected (400) for disallowed file extensions.
    """
    _ = mocker.patch.object(UploadCommand, "run")
    response = client.post(
        "/api/v1/database/1/columnar_upload/",
        data={
            "file": create_columnar_file(filename=filename),
            "table_name": "table1",
        },
        content_type="multipart/form-data",
    )
    assert response.status_code == 400
    assert response.json == {"message": {"file": ["File extension is not allowed."]}}
def test_csv_metadata(mocker: MockFixture, client: Any, full_api_access: None) -> None:
    """POSTing a valid CSV to csv_metadata returns HTTP 200."""
    mocker.patch.object(CSVReader, "file_metadata")
    payload = {"file": create_csv_file()}
    rv = client.post(
        "/api/v1/database/csv_metadata/",
        data=payload,
        content_type="multipart/form-data",
    )
    assert rv.status_code == 200
def test_csv_metadata_bad_extension(
    mocker: MockFixture, client: Any, full_api_access: None
) -> None:
    """csv_metadata rejects a file with a disallowed extension (HTTP 400)."""
    mocker.patch.object(CSVReader, "file_metadata")
    payload = {"file": create_csv_file(filename="test.out")}
    rv = client.post(
        "/api/v1/database/csv_metadata/",
        data=payload,
        content_type="multipart/form-data",
    )
    assert rv.status_code == 400
    assert rv.json == {"message": {"file": ["File extension is not allowed."]}}
def test_csv_metadata_validation(
    mocker: MockFixture, client: Any, full_api_access: None
) -> None:
    """csv_metadata rejects a request with no file attached (HTTP 400)."""
    mocker.patch.object(CSVReader, "file_metadata")
    rv = client.post(
        "/api/v1/database/csv_metadata/",
        data={},
        content_type="multipart/form-data",
    )
    assert rv.status_code == 400
    assert rv.json == {"message": {"file": ["Field may not be null."]}}
def test_excel_metadata(
    mocker: MockFixture, client: Any, full_api_access: None
) -> None:
    """POSTing a valid Excel file to excel_metadata returns HTTP 200."""
    mocker.patch.object(ExcelReader, "file_metadata")
    payload = {"file": create_excel_file()}
    rv = client.post(
        "/api/v1/database/excel_metadata/",
        data=payload,
        content_type="multipart/form-data",
    )
    assert rv.status_code == 200
def test_excel_metadata_bad_extension(
    mocker: MockFixture, client: Any, full_api_access: None
) -> None:
    """excel_metadata rejects a file with a disallowed extension (HTTP 400)."""
    mocker.patch.object(ExcelReader, "file_metadata")
    payload = {"file": create_excel_file(filename="test.out")}
    rv = client.post(
        "/api/v1/database/excel_metadata/",
        data=payload,
        content_type="multipart/form-data",
    )
    assert rv.status_code == 400
    assert rv.json == {"message": {"file": ["File extension is not allowed."]}}
def test_excel_metadata_validation(
    mocker: MockFixture, client: Any, full_api_access: None
) -> None:
    """excel_metadata rejects a request with no file attached (HTTP 400)."""
    mocker.patch.object(ExcelReader, "file_metadata")
    rv = client.post(
        "/api/v1/database/excel_metadata/",
        data={},
        content_type="multipart/form-data",
    )
    assert rv.status_code == 400
    assert rv.json == {"message": {"file": ["Field may not be null."]}}
def test_columnar_metadata(
    mocker: MockFixture, client: Any, full_api_access: None
) -> None:
    """POSTing a valid parquet file to columnar_metadata returns HTTP 200."""
    mocker.patch.object(ColumnarReader, "file_metadata")
    payload = {"file": create_columnar_file()}
    rv = client.post(
        "/api/v1/database/columnar_metadata/",
        data=payload,
        content_type="multipart/form-data",
    )
    assert rv.status_code == 200
def test_columnar_metadata_bad_extension(
    mocker: MockFixture, client: Any, full_api_access: None
) -> None:
    """columnar_metadata rejects a file with a disallowed extension (HTTP 400)."""
    mocker.patch.object(ColumnarReader, "file_metadata")
    payload = {"file": create_columnar_file(filename="test.out")}
    rv = client.post(
        "/api/v1/database/columnar_metadata/",
        data=payload,
        content_type="multipart/form-data",
    )
    assert rv.status_code == 400
    assert rv.json == {"message": {"file": ["File extension is not allowed."]}}
def test_columnar_metadata_validation(
    mocker: MockFixture, client: Any, full_api_access: None
) -> None:
    """columnar_metadata rejects a request with no file attached (HTTP 400)."""
    mocker.patch.object(ColumnarReader, "file_metadata")
    rv = client.post(
        "/api/v1/database/columnar_metadata/",
        data={},
        content_type="multipart/form-data",
    )
    assert rv.status_code == 400
    assert rv.json == {"message": {"file": ["Field may not be null."]}}
def test_table_metadata_happy_path(
mocker: MockFixture,
client: Any,

View File

@ -24,6 +24,7 @@ from typing import Any
import pandas as pd
import pytest
from werkzeug.datastructures import FileStorage
@pytest.fixture
@ -31,7 +32,9 @@ def dttm() -> datetime:
return datetime.strptime("2019-01-02 03:04:05.678900", "%Y-%m-%d %H:%M:%S.%f")
def create_csv_file(data: list[list[str]] | None = None, delimiter=",") -> BytesIO:
def create_csv_file(
data: list[list[str]] | None = None, delimiter=",", filename="test.csv"
) -> FileStorage:
data = (
[
["Name", "Age", "City"],
@ -46,14 +49,27 @@ def create_csv_file(data: list[list[str]] | None = None, delimiter=",") -> Bytes
for row in data:
writer.writerow(row)
output.seek(0)
bytes_buffer = BytesIO(output.getvalue().encode("utf-8"))
return bytes_buffer
buffer = BytesIO(output.getvalue().encode("utf-8"))
return FileStorage(stream=buffer, filename=filename)
def create_excel_file(data: dict[str, list[Any]] | None = None) -> BytesIO:
def create_excel_file(
data: dict[str, list[Any]] | None = None, filename="test.xls"
) -> FileStorage:
data = {"Name": ["John"], "Age": [30], "City": ["New York"]} if not data else data
excel_buffer = BytesIO()
buffer = BytesIO()
df = pd.DataFrame(data)
df.to_excel(excel_buffer, index=False)
excel_buffer.seek(0)
return excel_buffer
df.to_excel(buffer, index=False)
buffer.seek(0)
return FileStorage(stream=buffer, filename=filename)
def create_columnar_file(
    data: dict[str, list[Any]] | None = None, filename="test.parquet"
) -> FileStorage:
    """Build an in-memory parquet file wrapped in a FileStorage for upload tests."""
    if not data:
        data = {"Name": ["John"], "Age": [30], "City": ["New York"]}
    parquet_bytes = BytesIO()
    pd.DataFrame(data).to_parquet(parquet_bytes, index=False)
    parquet_bytes.seek(0)
    return FileStorage(stream=parquet_bytes, filename=filename)