diff --git a/superset/charts/api.py b/superset/charts/api.py index 7c756f187..20ea943ee 100644 --- a/superset/charts/api.py +++ b/superset/charts/api.py @@ -484,12 +484,7 @@ class ChartRestApi(BaseSupersetModelRestApi): if result_format == ChartDataResultFormat.CSV: # return the first result data = result["queries"][0]["data"] - return CsvResponse( - data, - status=200, - headers=generate_download_headers("csv"), - mimetype="application/csv", - ) + return CsvResponse(data, headers=generate_download_headers("csv")) if result_format == ChartDataResultFormat.JSON: response_data = simplejson.dumps( diff --git a/superset/common/query_context.py b/superset/common/query_context.py index f381f4542..16520add7 100644 --- a/superset/common/query_context.py +++ b/superset/common/query_context.py @@ -35,6 +35,7 @@ from superset.exceptions import ( ) from superset.extensions import cache_manager, security_manager from superset.stats_logger import BaseStatsLogger +from superset.utils import csv from superset.utils.cache import generate_cache_key, set_and_log_cache from superset.utils.core import ( ChartDataResultFormat, @@ -151,7 +152,9 @@ class QueryContext: def get_data(self, df: pd.DataFrame,) -> Union[str, List[Dict[str, Any]]]: if self.result_format == ChartDataResultFormat.CSV: include_index = not isinstance(df.index, pd.RangeIndex) - result = df.to_csv(index=include_index, **config["CSV_EXPORT"]) + result = csv.df_to_escaped_csv( + df, index=include_index, **config["CSV_EXPORT"] + ) return result or "" return df.to_dict(orient="records") diff --git a/superset/utils/csv.py b/superset/utils/csv.py new file mode 100644 index 000000000..d91d427d5 --- /dev/null +++ b/superset/utils/csv.py @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import re +from typing import Any + +import pandas as pd + +negative_number_re = re.compile(r"^-[0-9.]+$") + +# This regex will match if the string starts with: +# +# 1. one of -, @, +, |, =, % +# 2. two double quotes immediately followed by one of -, @, +, |, =, % +# 3. one or more spaces immediately followed by one of -, @, +, |, =, % +# +problematic_chars_re = re.compile(r'^(?:"{2}|\s{1,})(?=[\-@+|=%])|^[\-@+|=%]') + + +def escape_value(value: str) -> str: + """ + Escapes a set of special characters. + + http://georgemauer.net/2017/10/07/csv-injection.html + """ + needs_escaping = problematic_chars_re.match(value) is not None + is_negative_number = negative_number_re.match(value) is not None + + if needs_escaping and not is_negative_number: + # Escape pipe to be extra safe as this + # can lead to remote code execution + value = value.replace("|", "\\|") + + # Precede the line with a single quote. This prevents + # evaluation of commands and some spreadsheet software + # will hide this visually from the user. Many articles + # claim a preceding space will work here too, however, + # when uploading a csv file in Google sheets, a leading + # space was ignored and code was still evaluated. + value = "'" + value + + return value + + +def df_to_escaped_csv(df: pd.DataFrame, **kwargs: Any) -> Any: + escape_values = lambda v: escape_value(v) if isinstance(v, str) else v + + # Escape csv headers + df = df.rename(columns=escape_values) + + # Escape csv rows + df = df.applymap(escape_values) + + return df.to_csv(**kwargs) diff --git a/superset/views/base.py b/superset/views/base.py index cb486c513..ea80bb609 100644 --- a/superset/views/base.py +++ b/superset/views/base.py @@ -479,6 +479,7 @@ class CsvResponse(Response): # pylint: disable=too-many-ancestors """ charset = conf["CSV_EXPORT"].get("encoding", "utf-8") + default_mimetype = "text/csv" def check_ownership(obj: Any, raise_if_false: bool = True) -> bool: diff --git a/superset/views/core.py b/superset/views/core.py index 26fb72244..db5824720 100755 --- a/superset/views/core.py +++ b/superset/views/core.py @@ -98,7 +98,7 @@ from superset.sql_parse import CtasMethod, ParsedQuery, Table from superset.sql_validators import get_validator_by_name from superset.tasks.async_queries import load_explore_json_into_cache from superset.typing import FlaskResponse -from superset.utils import core as utils +from superset.utils import core as utils, csv from superset.utils.async_query_manager import AsyncQueryTokenException from superset.utils.cache import etag_cache from superset.utils.core import ReservedUrlParameters @@ -437,10 +437,7 @@ class Superset(BaseSupersetView): # pylint: disable=too-many-public-methods ) -> FlaskResponse: if response_type == utils.ChartDataResultFormat.CSV: return CsvResponse( - viz_obj.get_csv(), - status=200, - headers=generate_download_headers("csv"), - mimetype="application/csv", + viz_obj.get_csv(), headers=generate_download_headers("csv") ) if response_type == utils.ChartDataResultType.QUERY: @@ -2628,18 +2625,14 @@ class Superset(BaseSupersetView): # pylint: disable=too-many-public-methods columns = [c["name"] for c in obj["columns"]] df = pd.DataFrame.from_records(obj["data"], columns=columns) logger.info("Using pandas to convert to CSV") - csv = df.to_csv(index=False, **config["CSV_EXPORT"]) else: logger.info("Running a query to turn into CSV") sql = query.select_sql or query.executed_sql df = query.database.get_df(sql, query.schema) - # TODO(bkyryliuk): add compression=gzip for big files. - csv = df.to_csv(index=False, **config["CSV_EXPORT"]) - response = Response(csv, mimetype="text/csv") + csv_data = csv.df_to_escaped_csv(df, index=False, **config["CSV_EXPORT"]) quoted_csv_name = parse.quote(query.name) - response.headers["Content-Disposition"] = ( - f'attachment; filename="{quoted_csv_name}.csv"; ' - f"filename*=UTF-8''{quoted_csv_name}.csv" + response = CsvResponse( + csv_data, headers=generate_download_headers("csv", quoted_csv_name) ) event_info = { "event_type": "data_export", diff --git a/superset/viz.py b/superset/viz.py index dbdaf76e6..a9b4e9e42 100644 --- a/superset/viz.py +++ b/superset/viz.py @@ -66,7 +66,7 @@ from superset.extensions import cache_manager, security_manager from superset.models.cache import CacheKey from superset.models.helpers import QueryResult from superset.typing import QueryObjectDict, VizData, VizPayload -from superset.utils import core as utils +from superset.utils import core as utils, csv from superset.utils.cache import set_and_log_cache from superset.utils.core import ( DTTM_ALIAS, @@ -615,7 +615,7 @@ class BaseViz: def get_csv(self) -> Optional[str]: df = self.get_df_payload()["df"] # leverage caching logic include_index = not isinstance(df.index, pd.RangeIndex) - return df.to_csv(index=include_index, **config["CSV_EXPORT"]) + return csv.df_to_escaped_csv(df, index=include_index, **config["CSV_EXPORT"]) def get_data(self, df: pd.DataFrame) -> VizData: return df.to_dict(orient="records") diff --git a/tests/utils/csv_tests.py b/tests/utils/csv_tests.py new file mode 100644 index 000000000..e992fcb59 --- /dev/null +++ b/tests/utils/csv_tests.py @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=no-self-use +import io + +import pandas as pd +import pytest + +from superset.utils import csv + + +def test_escape_value(): + result = csv.escape_value("value") + assert result == "value" + + result = csv.escape_value("-10") + assert result == "-10" + + result = csv.escape_value("@value") + assert result == "'@value" + + result = csv.escape_value("+value") + assert result == "'+value" + + result = csv.escape_value("-value") + assert result == "'-value" + + result = csv.escape_value("=value") + assert result == "'=value" + + result = csv.escape_value("|value") + assert result == "'\|value" + + result = csv.escape_value("%value") + assert result == "'%value" + + result = csv.escape_value("=cmd|' /C calc'!A0") + assert result == "'=cmd\|' /C calc'!A0" + + result = csv.escape_value('""=10+2') + assert result == '\'""=10+2' + + result = csv.escape_value(" =10+2") + assert result == "' =10+2" + + +def test_df_to_escaped_csv(): + csv_rows = [ + ["col_a", "=func()"], + ["-10", "=cmd|' /C calc'!A0"], + ["a", '""=b'], + [" =a", "b"], + ] + csv_str = "\n".join([",".join(row) for row in csv_rows]) + + df = pd.read_csv(io.StringIO(csv_str)) + + escaped_csv_str = csv.df_to_escaped_csv(df, encoding="utf8", index=False) + escaped_csv_rows = [row.split(",") for row in escaped_csv_str.strip().split("\n")] + + assert escaped_csv_rows == [ + ["col_a", "'=func()"], + ["-10", "'=cmd\|' /C calc'!A0"], + ["a", "'=b"], # pandas seems to be removing the leading "" + ["' =a", "b"], + ]