SQL Lab: Use numpy structured arrays, fallback to JSON serialization (#9096)

* Use numpy structured arrays, fallback to JSON serialization

* Explicitly cast data as list when creating numpy array
This commit is contained in:
Rob DiCiuccio 2020-02-06 17:25:22 -08:00 committed by GitHub
parent 8a138fbd03
commit 161d211c07
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 74 additions and 17 deletions

View File

@ -57,6 +57,15 @@ def dedup(l: List[str], suffix: str = "__", case_sensitive: bool = True) -> List
return new_l
def stringify(obj: Any) -> str:
    """Serialize *obj* to a JSON string.

    Falls back to the project's ISO-datetime serializer
    (``utils.json_iso_dttm_ser``) for values the stdlib encoder
    cannot handle natively.
    """
    serialized: str = json.dumps(obj, default=utils.json_iso_dttm_ser)
    return serialized
def stringify_values(array: np.ndarray) -> np.ndarray:
    """Apply :func:`stringify` element-wise over *array*.

    Returns a new array whose elements are the JSON-string
    representations of the originals.
    """
    return np.vectorize(stringify)(array)
class SupersetResultSet:
def __init__(
self,
@ -68,6 +77,8 @@ class SupersetResultSet:
column_names: List[str] = []
pa_data: List[pa.Array] = []
deduped_cursor_desc: List[Tuple[Any, ...]] = []
numpy_dtype: List[Tuple[str, ...]] = []
stringified_arr: np.ndarray
if cursor_description:
# get deduped list of column names
@ -79,33 +90,45 @@ class SupersetResultSet:
for column_name, description in zip(column_names, cursor_description)
]
# put data in a 2D array so we can efficiently access each column;
array = np.array(data, dtype="object")
if array.size > 0:
pa_data = [pa.array(array[:, i]) for i, column in enumerate(column_names)]
# generate numpy structured array dtype
numpy_dtype = [(column_name, "object") for column_name in column_names]
# put data in a structured array so we can efficiently access each column.
# cast `data` as list due to MySQL (others?) wrapping results with a tuple.
array = np.array(list(data), dtype=numpy_dtype)
if array.size > 0:
for column in column_names:
try:
pa_data.append(pa.array(array[column].tolist()))
except (
pa.lib.ArrowInvalid,
pa.lib.ArrowTypeError,
pa.lib.ArrowNotImplementedError,
):
# attempt serialization of values as strings
stringified_arr = stringify_values(array[column])
pa_data.append(pa.array(stringified_arr.tolist()))
# workaround for bug converting `psycopg2.tz.FixedOffsetTimezone` tzinfo values.
# related: https://issues.apache.org/jira/browse/ARROW-5248
if pa_data:
for i, column in enumerate(column_names):
# TODO: revisit nested column serialization once Arrow 1.0 is released with:
# https://github.com/apache/arrow/pull/6199
# Related issue: #8978
if pa.types.is_nested(pa_data[i].type):
stringify_func = lambda item: json.dumps(
item, default=utils.json_iso_dttm_ser
)
vfunc = np.vectorize(stringify_func)
strigified_arr = vfunc(array[:, i])
pa_data[i] = pa.array(strigified_arr)
# TODO: revisit nested column serialization once PyArrow updated with:
# https://github.com/apache/arrow/pull/6199
# Related issue: https://github.com/apache/incubator-superset/issues/8978
stringified_arr = stringify_values(array[column])
pa_data[i] = pa.array(stringified_arr.tolist())
elif pa.types.is_temporal(pa_data[i].type):
sample = self.first_nonempty(array[:, i])
# workaround for bug converting `psycopg2.tz.FixedOffsetTimezone` tzinfo values.
# related: https://issues.apache.org/jira/browse/ARROW-5248
sample = self.first_nonempty(array[column])
if sample and isinstance(sample, datetime.datetime):
try:
if sample.tzinfo:
tz = sample.tzinfo
series = pd.Series(array[:, i], dtype="datetime64[ns]")
series = pd.Series(
array[column], dtype="datetime64[ns]"
)
series = pd.to_datetime(series).dt.tz_localize(tz)
pa_data[i] = pa.Array.from_pandas(
series, type=pa.timestamp("ns", tz=tz)

View File

@ -166,6 +166,40 @@ class SupersetResultSetTestCase(SupersetTestCase):
],
)
def test_single_column_multidim_nested_types(self):
    """A single column holding deeply nested lists should be typed as
    STRING and round-trip to its JSON string representation."""
    # One row, one column; the value mixes strings, ints, multi-level
    # nested lists, and trailing None entries.
    nested_rows = [
        (
            [
                "test",
                [
                    [
                        "foo",
                        123456,
                        [
                            [["test"], 3432546, 7657658766],
                            [["fake"], 656756765, 324324324324],
                        ],
                    ]
                ],
                ["test2", 43, 765765765],
                None,
                None,
            ],
        )
    ]
    description = [("metadata",)]
    result_set = SupersetResultSet(nested_rows, description, BaseEngineSpec)
    # Nested values cannot be represented natively, so the column is
    # serialized to a JSON string.
    self.assertEqual(result_set.columns[0]["type"], "STRING")
    records = df_to_records(result_set.to_pandas_df())
    self.assertEqual(
        records,
        [
            {
                "metadata": '["test", [["foo", 123456, [[["test"], 3432546, 7657658766], [["fake"], 656756765, 324324324324]]]], ["test2", 43, 765765765], null, null]'
            }
        ],
    )
def test_empty_datetime(self):
data = [(None,)]
cursor_descr = [("ds", "timestamp", None, None, None, None, True)]