fix(#13734): Properly escape special characters in CSV output (#13735)

* fix: Escape csv content during downloads

* Reuse CsvResponse object

* Use correct mimetype for csv responses

* Ensure that headers are also escaped

* Update escaping logic
This commit is contained in:
Ben Reinhart 2021-03-26 15:22:00 -07:00 committed by GitHub
parent 5842cb1b12
commit 55ba47ec2e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 160 additions and 21 deletions

View File

@ -484,12 +484,7 @@ class ChartRestApi(BaseSupersetModelRestApi):
if result_format == ChartDataResultFormat.CSV:
# return the first result
data = result["queries"][0]["data"]
return CsvResponse(
data,
status=200,
headers=generate_download_headers("csv"),
mimetype="application/csv",
)
return CsvResponse(data, headers=generate_download_headers("csv"))
if result_format == ChartDataResultFormat.JSON:
response_data = simplejson.dumps(

View File

@ -35,6 +35,7 @@ from superset.exceptions import (
)
from superset.extensions import cache_manager, security_manager
from superset.stats_logger import BaseStatsLogger
from superset.utils import csv
from superset.utils.cache import generate_cache_key, set_and_log_cache
from superset.utils.core import (
ChartDataResultFormat,
@ -151,7 +152,9 @@ class QueryContext:
def get_data(self, df: pd.DataFrame,) -> Union[str, List[Dict[str, Any]]]:
if self.result_format == ChartDataResultFormat.CSV:
include_index = not isinstance(df.index, pd.RangeIndex)
result = df.to_csv(index=include_index, **config["CSV_EXPORT"])
result = csv.df_to_escaped_csv(
df, index=include_index, **config["CSV_EXPORT"]
)
return result or ""
return df.to_dict(orient="records")

67
superset/utils/csv.py Normal file
View File

@ -0,0 +1,67 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import re
from typing import Any
import pandas as pd
negative_number_re = re.compile(r"^-[0-9.]+$")
# This regex will match if the string starts with:
#
# 1. one of -, @, +, |, =, %
# 2. two double quotes immediately followed by one of -, @, +, |, =, %
# 3. one or more spaces immediately followed by one of -, @, +, |, =, %
#
problematic_chars_re = re.compile(r'^(?:"{2}|\s{1,})(?=[\-@+|=%])|^[\-@+|=%]')
def escape_value(value: str) -> str:
"""
Escapes a set of special characters.
http://georgemauer.net/2017/10/07/csv-injection.html
"""
needs_escaping = problematic_chars_re.match(value) is not None
is_negative_number = negative_number_re.match(value) is not None
if needs_escaping and not is_negative_number:
# Escape pipe to be extra safe as this
# can lead to remote code execution
value = value.replace("|", "\\|")
# Precede the line with a single quote. This prevents
# evaluation of commands and some spreadsheet software
# will hide this visually from the user. Many articles
# claim a preceding space will work here too, however,
# when uploading a csv file in Google sheets, a leading
# space was ignored and code was still evaluated.
value = "'" + value
return value
def df_to_escaped_csv(df: pd.DataFrame, **kwargs: Any) -> Any:
escape_values = lambda v: escape_value(v) if isinstance(v, str) else v
# Escape csv headers
df = df.rename(columns=escape_values)
# Escape csv rows
df = df.applymap(escape_values)
return df.to_csv(**kwargs)

View File

@ -479,6 +479,7 @@ class CsvResponse(Response): # pylint: disable=too-many-ancestors
"""
charset = conf["CSV_EXPORT"].get("encoding", "utf-8")
default_mimetype = "text/csv"
def check_ownership(obj: Any, raise_if_false: bool = True) -> bool:

View File

@ -98,7 +98,7 @@ from superset.sql_parse import CtasMethod, ParsedQuery, Table
from superset.sql_validators import get_validator_by_name
from superset.tasks.async_queries import load_explore_json_into_cache
from superset.typing import FlaskResponse
from superset.utils import core as utils
from superset.utils import core as utils, csv
from superset.utils.async_query_manager import AsyncQueryTokenException
from superset.utils.cache import etag_cache
from superset.utils.core import ReservedUrlParameters
@ -437,10 +437,7 @@ class Superset(BaseSupersetView): # pylint: disable=too-many-public-methods
) -> FlaskResponse:
if response_type == utils.ChartDataResultFormat.CSV:
return CsvResponse(
viz_obj.get_csv(),
status=200,
headers=generate_download_headers("csv"),
mimetype="application/csv",
viz_obj.get_csv(), headers=generate_download_headers("csv")
)
if response_type == utils.ChartDataResultType.QUERY:
@ -2628,18 +2625,14 @@ class Superset(BaseSupersetView): # pylint: disable=too-many-public-methods
columns = [c["name"] for c in obj["columns"]]
df = pd.DataFrame.from_records(obj["data"], columns=columns)
logger.info("Using pandas to convert to CSV")
csv = df.to_csv(index=False, **config["CSV_EXPORT"])
else:
logger.info("Running a query to turn into CSV")
sql = query.select_sql or query.executed_sql
df = query.database.get_df(sql, query.schema)
# TODO(bkyryliuk): add compression=gzip for big files.
csv = df.to_csv(index=False, **config["CSV_EXPORT"])
response = Response(csv, mimetype="text/csv")
csv_data = csv.df_to_escaped_csv(df, index=False, **config["CSV_EXPORT"])
quoted_csv_name = parse.quote(query.name)
response.headers["Content-Disposition"] = (
f'attachment; filename="{quoted_csv_name}.csv"; '
f"filename*=UTF-8''{quoted_csv_name}.csv"
response = CsvResponse(
csv_data, headers=generate_download_headers("csv", quoted_csv_name)
)
event_info = {
"event_type": "data_export",

View File

@ -66,7 +66,7 @@ from superset.extensions import cache_manager, security_manager
from superset.models.cache import CacheKey
from superset.models.helpers import QueryResult
from superset.typing import QueryObjectDict, VizData, VizPayload
from superset.utils import core as utils
from superset.utils import core as utils, csv
from superset.utils.cache import set_and_log_cache
from superset.utils.core import (
DTTM_ALIAS,
@ -615,7 +615,7 @@ class BaseViz:
def get_csv(self) -> Optional[str]:
df = self.get_df_payload()["df"] # leverage caching logic
include_index = not isinstance(df.index, pd.RangeIndex)
return df.to_csv(index=include_index, **config["CSV_EXPORT"])
return csv.df_to_escaped_csv(df, index=include_index, **config["CSV_EXPORT"])
def get_data(self, df: pd.DataFrame) -> VizData:
return df.to_dict(orient="records")

80
tests/utils/csv_tests.py Normal file
View File

@ -0,0 +1,80 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=no-self-use
import io
import pandas as pd
import pytest
from superset.utils import csv
def test_escape_value():
result = csv.escape_value("value")
assert result == "value"
result = csv.escape_value("-10")
assert result == "-10"
result = csv.escape_value("@value")
assert result == "'@value"
result = csv.escape_value("+value")
assert result == "'+value"
result = csv.escape_value("-value")
assert result == "'-value"
result = csv.escape_value("=value")
assert result == "'=value"
result = csv.escape_value("|value")
assert result == "'\|value"
result = csv.escape_value("%value")
assert result == "'%value"
result = csv.escape_value("=cmd|' /C calc'!A0")
assert result == "'=cmd\|' /C calc'!A0"
result = csv.escape_value('""=10+2')
assert result == '\'""=10+2'
result = csv.escape_value(" =10+2")
assert result == "' =10+2"
def test_df_to_escaped_csv():
csv_rows = [
["col_a", "=func()"],
["-10", "=cmd|' /C calc'!A0"],
["a", '""=b'],
[" =a", "b"],
]
csv_str = "\n".join([",".join(row) for row in csv_rows])
df = pd.read_csv(io.StringIO(csv_str))
escaped_csv_str = csv.df_to_escaped_csv(df, encoding="utf8", index=False)
escaped_csv_rows = [row.split(",") for row in escaped_csv_str.strip().split("\n")]
assert escaped_csv_rows == [
["col_a", "'=func()"],
["-10", "'=cmd\|' /C calc'!A0"],
["a", "'=b"], # pandas seems to be removing the leading ""
["' =a", "b"],
]