* fix: Escape csv content during downloads * Reuse CsvResponse object * Use correct mimetype for csv responses * Ensure that headers are also escaped * Update escaping logic
This commit is contained in:
parent
5842cb1b12
commit
55ba47ec2e
|
|
@ -484,12 +484,7 @@ class ChartRestApi(BaseSupersetModelRestApi):
|
|||
if result_format == ChartDataResultFormat.CSV:
|
||||
# return the first result
|
||||
data = result["queries"][0]["data"]
|
||||
return CsvResponse(
|
||||
data,
|
||||
status=200,
|
||||
headers=generate_download_headers("csv"),
|
||||
mimetype="application/csv",
|
||||
)
|
||||
return CsvResponse(data, headers=generate_download_headers("csv"))
|
||||
|
||||
if result_format == ChartDataResultFormat.JSON:
|
||||
response_data = simplejson.dumps(
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ from superset.exceptions import (
|
|||
)
|
||||
from superset.extensions import cache_manager, security_manager
|
||||
from superset.stats_logger import BaseStatsLogger
|
||||
from superset.utils import csv
|
||||
from superset.utils.cache import generate_cache_key, set_and_log_cache
|
||||
from superset.utils.core import (
|
||||
ChartDataResultFormat,
|
||||
|
|
@ -151,7 +152,9 @@ class QueryContext:
|
|||
def get_data(self, df: pd.DataFrame,) -> Union[str, List[Dict[str, Any]]]:
|
||||
if self.result_format == ChartDataResultFormat.CSV:
|
||||
include_index = not isinstance(df.index, pd.RangeIndex)
|
||||
result = df.to_csv(index=include_index, **config["CSV_EXPORT"])
|
||||
result = csv.df_to_escaped_csv(
|
||||
df, index=include_index, **config["CSV_EXPORT"]
|
||||
)
|
||||
return result or ""
|
||||
|
||||
return df.to_dict(orient="records")
|
||||
|
|
|
|||
|
|
@ -0,0 +1,67 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
import pandas as pd
|
||||
|
||||
negative_number_re = re.compile(r"^-[0-9.]+$")
|
||||
|
||||
# This regex will match if the string starts with:
|
||||
#
|
||||
# 1. one of -, @, +, |, =, %
|
||||
# 2. two double quotes immediately followed by one of -, @, +, |, =, %
|
||||
# 3. one or more spaces immediately followed by one of -, @, +, |, =, %
|
||||
#
|
||||
problematic_chars_re = re.compile(r'^(?:"{2}|\s{1,})(?=[\-@+|=%])|^[\-@+|=%]')
|
||||
|
||||
|
||||
def escape_value(value: str) -> str:
|
||||
"""
|
||||
Escapes a set of special characters.
|
||||
|
||||
http://georgemauer.net/2017/10/07/csv-injection.html
|
||||
"""
|
||||
needs_escaping = problematic_chars_re.match(value) is not None
|
||||
is_negative_number = negative_number_re.match(value) is not None
|
||||
|
||||
if needs_escaping and not is_negative_number:
|
||||
# Escape pipe to be extra safe as this
|
||||
# can lead to remote code execution
|
||||
value = value.replace("|", "\\|")
|
||||
|
||||
# Precede the line with a single quote. This prevents
|
||||
# evaluation of commands and some spreadsheet software
|
||||
# will hide this visually from the user. Many articles
|
||||
# claim a preceding space will work here too, however,
|
||||
# when uploading a csv file in Google sheets, a leading
|
||||
# space was ignored and code was still evaluated.
|
||||
value = "'" + value
|
||||
|
||||
return value
|
||||
|
||||
|
||||
def df_to_escaped_csv(df: pd.DataFrame, **kwargs: Any) -> Any:
|
||||
escape_values = lambda v: escape_value(v) if isinstance(v, str) else v
|
||||
|
||||
# Escape csv headers
|
||||
df = df.rename(columns=escape_values)
|
||||
|
||||
# Escape csv rows
|
||||
df = df.applymap(escape_values)
|
||||
|
||||
return df.to_csv(**kwargs)
|
||||
|
|
@ -479,6 +479,7 @@ class CsvResponse(Response): # pylint: disable=too-many-ancestors
|
|||
"""
|
||||
|
||||
charset = conf["CSV_EXPORT"].get("encoding", "utf-8")
|
||||
default_mimetype = "text/csv"
|
||||
|
||||
|
||||
def check_ownership(obj: Any, raise_if_false: bool = True) -> bool:
|
||||
|
|
|
|||
|
|
@ -98,7 +98,7 @@ from superset.sql_parse import CtasMethod, ParsedQuery, Table
|
|||
from superset.sql_validators import get_validator_by_name
|
||||
from superset.tasks.async_queries import load_explore_json_into_cache
|
||||
from superset.typing import FlaskResponse
|
||||
from superset.utils import core as utils
|
||||
from superset.utils import core as utils, csv
|
||||
from superset.utils.async_query_manager import AsyncQueryTokenException
|
||||
from superset.utils.cache import etag_cache
|
||||
from superset.utils.core import ReservedUrlParameters
|
||||
|
|
@ -437,10 +437,7 @@ class Superset(BaseSupersetView): # pylint: disable=too-many-public-methods
|
|||
) -> FlaskResponse:
|
||||
if response_type == utils.ChartDataResultFormat.CSV:
|
||||
return CsvResponse(
|
||||
viz_obj.get_csv(),
|
||||
status=200,
|
||||
headers=generate_download_headers("csv"),
|
||||
mimetype="application/csv",
|
||||
viz_obj.get_csv(), headers=generate_download_headers("csv")
|
||||
)
|
||||
|
||||
if response_type == utils.ChartDataResultType.QUERY:
|
||||
|
|
@ -2628,18 +2625,14 @@ class Superset(BaseSupersetView): # pylint: disable=too-many-public-methods
|
|||
columns = [c["name"] for c in obj["columns"]]
|
||||
df = pd.DataFrame.from_records(obj["data"], columns=columns)
|
||||
logger.info("Using pandas to convert to CSV")
|
||||
csv = df.to_csv(index=False, **config["CSV_EXPORT"])
|
||||
else:
|
||||
logger.info("Running a query to turn into CSV")
|
||||
sql = query.select_sql or query.executed_sql
|
||||
df = query.database.get_df(sql, query.schema)
|
||||
# TODO(bkyryliuk): add compression=gzip for big files.
|
||||
csv = df.to_csv(index=False, **config["CSV_EXPORT"])
|
||||
response = Response(csv, mimetype="text/csv")
|
||||
csv_data = csv.df_to_escaped_csv(df, index=False, **config["CSV_EXPORT"])
|
||||
quoted_csv_name = parse.quote(query.name)
|
||||
response.headers["Content-Disposition"] = (
|
||||
f'attachment; filename="{quoted_csv_name}.csv"; '
|
||||
f"filename*=UTF-8''{quoted_csv_name}.csv"
|
||||
response = CsvResponse(
|
||||
csv_data, headers=generate_download_headers("csv", quoted_csv_name)
|
||||
)
|
||||
event_info = {
|
||||
"event_type": "data_export",
|
||||
|
|
|
|||
|
|
@ -66,7 +66,7 @@ from superset.extensions import cache_manager, security_manager
|
|||
from superset.models.cache import CacheKey
|
||||
from superset.models.helpers import QueryResult
|
||||
from superset.typing import QueryObjectDict, VizData, VizPayload
|
||||
from superset.utils import core as utils
|
||||
from superset.utils import core as utils, csv
|
||||
from superset.utils.cache import set_and_log_cache
|
||||
from superset.utils.core import (
|
||||
DTTM_ALIAS,
|
||||
|
|
@ -615,7 +615,7 @@ class BaseViz:
|
|||
def get_csv(self) -> Optional[str]:
|
||||
df = self.get_df_payload()["df"] # leverage caching logic
|
||||
include_index = not isinstance(df.index, pd.RangeIndex)
|
||||
return df.to_csv(index=include_index, **config["CSV_EXPORT"])
|
||||
return csv.df_to_escaped_csv(df, index=include_index, **config["CSV_EXPORT"])
|
||||
|
||||
def get_data(self, df: pd.DataFrame) -> VizData:
|
||||
return df.to_dict(orient="records")
|
||||
|
|
|
|||
|
|
@ -0,0 +1,80 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
# pylint: disable=no-self-use
|
||||
import io
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from superset.utils import csv
|
||||
|
||||
|
||||
def test_escape_value():
|
||||
result = csv.escape_value("value")
|
||||
assert result == "value"
|
||||
|
||||
result = csv.escape_value("-10")
|
||||
assert result == "-10"
|
||||
|
||||
result = csv.escape_value("@value")
|
||||
assert result == "'@value"
|
||||
|
||||
result = csv.escape_value("+value")
|
||||
assert result == "'+value"
|
||||
|
||||
result = csv.escape_value("-value")
|
||||
assert result == "'-value"
|
||||
|
||||
result = csv.escape_value("=value")
|
||||
assert result == "'=value"
|
||||
|
||||
result = csv.escape_value("|value")
|
||||
assert result == "'\|value"
|
||||
|
||||
result = csv.escape_value("%value")
|
||||
assert result == "'%value"
|
||||
|
||||
result = csv.escape_value("=cmd|' /C calc'!A0")
|
||||
assert result == "'=cmd\|' /C calc'!A0"
|
||||
|
||||
result = csv.escape_value('""=10+2')
|
||||
assert result == '\'""=10+2'
|
||||
|
||||
result = csv.escape_value(" =10+2")
|
||||
assert result == "' =10+2"
|
||||
|
||||
|
||||
def test_df_to_escaped_csv():
|
||||
csv_rows = [
|
||||
["col_a", "=func()"],
|
||||
["-10", "=cmd|' /C calc'!A0"],
|
||||
["a", '""=b'],
|
||||
[" =a", "b"],
|
||||
]
|
||||
csv_str = "\n".join([",".join(row) for row in csv_rows])
|
||||
|
||||
df = pd.read_csv(io.StringIO(csv_str))
|
||||
|
||||
escaped_csv_str = csv.df_to_escaped_csv(df, encoding="utf8", index=False)
|
||||
escaped_csv_rows = [row.split(",") for row in escaped_csv_str.strip().split("\n")]
|
||||
|
||||
assert escaped_csv_rows == [
|
||||
["col_a", "'=func()"],
|
||||
["-10", "'=cmd\|' /C calc'!A0"],
|
||||
["a", "'=b"], # pandas seems to be removing the leading ""
|
||||
["' =a", "b"],
|
||||
]
|
||||
Loading…
Reference in New Issue