295 lines
9.1 KiB
Python
295 lines
9.1 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
import io
|
|
from datetime import datetime
|
|
from typing import Any
|
|
|
|
import numpy as np
|
|
import pytest
|
|
import xlsxwriter
|
|
from werkzeug.datastructures import FileStorage
|
|
from xlsxwriter.workbook import Worksheet
|
|
|
|
from superset.commands.database.exceptions import DatabaseUploadFailed
|
|
from superset.commands.database.uploaders.excel_reader import (
|
|
ExcelReader,
|
|
ExcelReaderOptions,
|
|
)
|
|
from tests.unit_tests.fixtures.common import create_excel_file
|
|
|
|
EXCEL_DATA: dict[str, list[Any]] = {
|
|
"Name": ["name1", "name2", "name3"],
|
|
"Age": [30, 25, 20],
|
|
"City": ["city1", "city2", "city3"],
|
|
"Birth": ["1990-02-01", "1995-02-01", "2000-02-01"],
|
|
}
|
|
|
|
EXCEL_WITH_NULLS: dict[str, list[Any]] = {
|
|
"Name": ["name1", "name2", "name3"],
|
|
"Age": ["N/A", 25, 20],
|
|
"City": ["city1", "None", "city3"],
|
|
"Birth": ["1990-02-01", "1995-02-01", "2000-02-01"],
|
|
}
|
|
|
|
EXCEL_DATA_DECIMAL_CHAR = {
|
|
"Name": ["name1"],
|
|
"Age": ["30,1"],
|
|
"City": ["city1"],
|
|
"Birth": ["1990-02-01"],
|
|
}
|
|
|
|
|
|
def write_data_to_worksheet(
|
|
worksheet: Worksheet, header: list[str], data: list[list[Any]]
|
|
):
|
|
all_data = [header] + data
|
|
row = 0
|
|
col = 0
|
|
for name, age in all_data:
|
|
worksheet.write(row, col, name)
|
|
worksheet.write(row, col + 1, age)
|
|
row += 1
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"file, options, expected_cols, expected_values",
|
|
[
|
|
(
|
|
create_excel_file(EXCEL_DATA),
|
|
ExcelReaderOptions(),
|
|
["Name", "Age", "City", "Birth"],
|
|
[
|
|
["name1", 30, "city1", "1990-02-01"],
|
|
["name2", 25, "city2", "1995-02-01"],
|
|
["name3", 20, "city3", "2000-02-01"],
|
|
],
|
|
),
|
|
(
|
|
create_excel_file(EXCEL_DATA),
|
|
ExcelReaderOptions(
|
|
columns_read=["Name", "Age"],
|
|
),
|
|
["Name", "Age"],
|
|
[
|
|
["name1", 30],
|
|
["name2", 25],
|
|
["name3", 20],
|
|
],
|
|
),
|
|
(
|
|
create_excel_file(EXCEL_DATA),
|
|
ExcelReaderOptions(
|
|
columns_read=[],
|
|
),
|
|
["Name", "Age", "City", "Birth"],
|
|
[
|
|
["name1", 30, "city1", "1990-02-01"],
|
|
["name2", 25, "city2", "1995-02-01"],
|
|
["name3", 20, "city3", "2000-02-01"],
|
|
],
|
|
),
|
|
(
|
|
create_excel_file(EXCEL_DATA),
|
|
ExcelReaderOptions(
|
|
rows_to_read=1,
|
|
),
|
|
["Name", "Age", "City", "Birth"],
|
|
[
|
|
["name1", 30.0, "city1", "1990-02-01"],
|
|
],
|
|
),
|
|
(
|
|
create_excel_file(EXCEL_DATA),
|
|
ExcelReaderOptions(
|
|
rows_to_read=1,
|
|
columns_read=["Name", "Age"],
|
|
),
|
|
["Name", "Age"],
|
|
[
|
|
["name1", 30.0],
|
|
],
|
|
),
|
|
(
|
|
create_excel_file(EXCEL_DATA),
|
|
ExcelReaderOptions(
|
|
skip_rows=1,
|
|
),
|
|
["name1", 30, "city1", "1990-02-01"],
|
|
[
|
|
["name2", 25.0, "city2", "1995-02-01"],
|
|
["name3", 20.0, "city3", "2000-02-01"],
|
|
],
|
|
),
|
|
(
|
|
create_excel_file(EXCEL_DATA),
|
|
ExcelReaderOptions(
|
|
column_dates=["Birth"],
|
|
),
|
|
["Name", "Age", "City", "Birth"],
|
|
[
|
|
["name1", 30, "city1", datetime(1990, 2, 1, 0, 0)],
|
|
["name2", 25, "city2", datetime(1995, 2, 1, 0, 0)],
|
|
["name3", 20, "city3", datetime(2000, 2, 1, 0, 0)],
|
|
],
|
|
),
|
|
(
|
|
create_excel_file(EXCEL_WITH_NULLS),
|
|
ExcelReaderOptions(
|
|
null_values=["N/A", "None"],
|
|
),
|
|
["Name", "Age", "City", "Birth"],
|
|
[
|
|
["name1", np.nan, "city1", "1990-02-01"],
|
|
["name2", 25.0, np.nan, "1995-02-01"],
|
|
["name3", 20.0, "city3", "2000-02-01"],
|
|
],
|
|
),
|
|
(
|
|
create_excel_file(EXCEL_DATA_DECIMAL_CHAR),
|
|
ExcelReaderOptions(
|
|
decimal_character=",",
|
|
),
|
|
["Name", "Age", "City", "Birth"],
|
|
[
|
|
["name1", 30.1, "city1", "1990-02-01"],
|
|
],
|
|
),
|
|
],
|
|
)
|
|
def test_excel_reader_file_to_dataframe(file, options, expected_cols, expected_values):
|
|
excel_reader = ExcelReader(
|
|
options=options,
|
|
)
|
|
df = excel_reader.file_to_dataframe(file)
|
|
assert df.columns.tolist() == expected_cols
|
|
actual_values = df.values.tolist()
|
|
for i in range(len(expected_values)):
|
|
for j in range(len(expected_values[i])):
|
|
expected_val = expected_values[i][j]
|
|
actual_val = actual_values[i][j]
|
|
|
|
# Check if both values are NaN
|
|
if isinstance(expected_val, float) and isinstance(actual_val, float):
|
|
assert np.isnan(expected_val) == np.isnan(actual_val)
|
|
else:
|
|
assert expected_val == actual_val
|
|
file.close()
|
|
|
|
|
|
def test_excel_reader_index_column():
|
|
excel_reader = ExcelReader(
|
|
options=ExcelReaderOptions(index_column="Name"),
|
|
)
|
|
df = excel_reader.file_to_dataframe(create_excel_file(EXCEL_DATA))
|
|
assert df.index.name == "Name"
|
|
|
|
|
|
def test_excel_reader_wrong_index_column():
|
|
excel_reader = ExcelReader(
|
|
options=ExcelReaderOptions(index_column="wrong"),
|
|
)
|
|
with pytest.raises(DatabaseUploadFailed) as ex:
|
|
excel_reader.file_to_dataframe(create_excel_file(EXCEL_DATA))
|
|
assert str(ex.value) == ("Parsing error: Index wrong invalid (sheet: 0)")
|
|
|
|
|
|
def test_excel_reader_wrong_columns_to_read():
|
|
excel_reader = ExcelReader(
|
|
options=ExcelReaderOptions(columns_read=["xpto"]),
|
|
)
|
|
with pytest.raises(DatabaseUploadFailed) as ex:
|
|
excel_reader.file_to_dataframe(create_excel_file(EXCEL_DATA))
|
|
assert str(ex.value) == (
|
|
"Parsing error: Usecols do not match columns, "
|
|
"columns expected but not found: ['xpto'] (sheet: 0)"
|
|
)
|
|
|
|
|
|
def test_excel_reader_wrong_date():
|
|
excel_reader = ExcelReader(
|
|
options=ExcelReaderOptions(column_dates=["xpto"]),
|
|
)
|
|
with pytest.raises(DatabaseUploadFailed) as ex:
|
|
excel_reader.file_to_dataframe(create_excel_file(EXCEL_DATA))
|
|
assert str(ex.value) == (
|
|
"Parsing error: Missing column provided to 'parse_dates':" " 'xpto' (sheet: 0)"
|
|
)
|
|
|
|
|
|
def test_excel_reader_invalid_file():
|
|
excel_reader = ExcelReader(
|
|
options=ExcelReaderOptions(),
|
|
)
|
|
with pytest.raises(DatabaseUploadFailed) as ex:
|
|
excel_reader.file_to_dataframe(FileStorage(io.BytesIO(b"c1")))
|
|
assert str(ex.value) == (
|
|
"Parsing error: Excel file format cannot be determined, you must specify an engine manually." # noqa: E501
|
|
)
|
|
|
|
|
|
def test_excel_reader_metadata():
|
|
excel_reader = ExcelReader(
|
|
options=ExcelReaderOptions(),
|
|
)
|
|
file = create_excel_file(EXCEL_DATA)
|
|
metadata = excel_reader.file_metadata(file)
|
|
assert metadata == {
|
|
"items": [
|
|
{"column_names": ["Name", "Age", "City", "Birth"], "sheet_name": "Sheet1"}
|
|
]
|
|
}
|
|
file.close()
|
|
|
|
|
|
def test_excel_reader_metadata_mul_sheets():
|
|
buffer = io.BytesIO()
|
|
workbook = xlsxwriter.Workbook(buffer)
|
|
|
|
worksheet1 = workbook.add_worksheet("Sheet1")
|
|
header1 = ["col11", "col12"]
|
|
data1 = [["v11", "v12"]]
|
|
write_data_to_worksheet(worksheet1, header1, data1)
|
|
|
|
worksheet2 = workbook.add_worksheet("Sheet2")
|
|
header2 = ["col21", "col22"]
|
|
data2 = [["v21", "v22"]]
|
|
write_data_to_worksheet(worksheet2, header2, data2)
|
|
workbook.close()
|
|
|
|
file = FileStorage(stream=buffer, filename="test.xls")
|
|
|
|
excel_reader = ExcelReader(
|
|
options=ExcelReaderOptions(),
|
|
)
|
|
metadata = excel_reader.file_metadata(file)
|
|
assert metadata == {
|
|
"items": [
|
|
{"column_names": ["col11", "col12"], "sheet_name": "Sheet1"},
|
|
{"column_names": ["col21", "col22"], "sheet_name": "Sheet2"},
|
|
]
|
|
}
|
|
file.close()
|
|
|
|
|
|
def test_excel_reader_file_metadata_invalid_file():
|
|
excel_reader = ExcelReader(
|
|
options=ExcelReaderOptions(),
|
|
)
|
|
with pytest.raises(DatabaseUploadFailed) as ex:
|
|
excel_reader.file_metadata(FileStorage(io.BytesIO(b"1")))
|
|
assert str(ex.value) == ("Excel file format cannot be determined")
|