fix(#13734): Properly escape special characters in CSV output (#13735)

* fix: Escape csv content during downloads * Reuse CsvResponse object * Use correct mimetype for csv responses * Ensure that headers are also escaped * Update escaping logic
2021-03-26 15:22:00 -07:00 · 2021-03-26 15:22:00 -07:00 · 55ba47ec2e
parent 5842cb1b12
commit 55ba47ec2e
7 changed files with 160 additions and 21 deletions
--- a/superset/charts/api.py
+++ b/superset/charts/api.py
@ -484,12 +484,7 @@ class ChartRestApi(BaseSupersetModelRestApi):
        if result_format == ChartDataResultFormat.CSV:
            # return the first result
            data = result["queries"][0]["data"]
-            return CsvResponse(
-                data,
-                status=200,
-                headers=generate_download_headers("csv"),
-                mimetype="application/csv",
-            )
+            return CsvResponse(data, headers=generate_download_headers("csv"))

        if result_format == ChartDataResultFormat.JSON:
            response_data = simplejson.dumps(
--- a/superset/common/query_context.py
+++ b/superset/common/query_context.py
@ -35,6 +35,7 @@ from superset.exceptions import (
 )
 from superset.extensions import cache_manager, security_manager
 from superset.stats_logger import BaseStatsLogger
+from superset.utils import csv
 from superset.utils.cache import generate_cache_key, set_and_log_cache
 from superset.utils.core import (
    ChartDataResultFormat,
@ -151,7 +152,9 @@ class QueryContext:
    def get_data(self, df: pd.DataFrame,) -> Union[str, List[Dict[str, Any]]]:
        if self.result_format == ChartDataResultFormat.CSV:
            include_index = not isinstance(df.index, pd.RangeIndex)
-            result = df.to_csv(index=include_index, **config["CSV_EXPORT"])
+            result = csv.df_to_escaped_csv(
+                df, index=include_index, **config["CSV_EXPORT"]
+            )
            return result or ""

        return df.to_dict(orient="records")
--- a/superset/utils/csv.py
+++ b/superset/utils/csv.py
@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import re
+from typing import Any
+
+import pandas as pd
+
+negative_number_re = re.compile(r"^-[0-9.]+$")
+
+# This regex will match if the string starts with:
+#
+#     1. one of -, @, +, |, =, %
+#     2. two double quotes immediately followed by one of -, @, +, |, =, %
+#     3. one or more spaces immediately followed by one of -, @, +, |, =, %
+#
+problematic_chars_re = re.compile(r'^(?:"{2}|\s{1,})(?=[\-@+|=%])|^[\-@+|=%]')
+
+
+def escape_value(value: str) -> str:
+    """
+    Escapes a set of special characters.
+
+    http://georgemauer.net/2017/10/07/csv-injection.html
+    """
+    needs_escaping = problematic_chars_re.match(value) is not None
+    is_negative_number = negative_number_re.match(value) is not None
+
+    if needs_escaping and not is_negative_number:
+        # Escape pipe to be extra safe as this
+        # can lead to remote code execution
+        value = value.replace("|", "\\|")
+
+        # Precede the line with a single quote. This prevents
+        # evaluation of commands and some spreadsheet software
+        # will hide this visually from the user. Many articles
+        # claim a preceding space will work here too, however,
+        # when uploading a csv file in Google sheets, a leading
+        # space was ignored and code was still evaluated.
+        value = "'" + value
+
+    return value
+
+
+def df_to_escaped_csv(df: pd.DataFrame, **kwargs: Any) -> Any:
+    escape_values = lambda v: escape_value(v) if isinstance(v, str) else v
+
+    # Escape csv headers
+    df = df.rename(columns=escape_values)
+
+    # Escape csv rows
+    df = df.applymap(escape_values)
+
+    return df.to_csv(**kwargs)
--- a/superset/views/base.py
+++ b/superset/views/base.py
@ -479,6 +479,7 @@ class CsvResponse(Response):  # pylint: disable=too-many-ancestors
    """

    charset = conf["CSV_EXPORT"].get("encoding", "utf-8")
+    default_mimetype = "text/csv"


 def check_ownership(obj: Any, raise_if_false: bool = True) -> bool:
--- a/superset/views/core.py
+++ b/superset/views/core.py
@ -98,7 +98,7 @@ from superset.sql_parse import CtasMethod, ParsedQuery, Table
 from superset.sql_validators import get_validator_by_name
 from superset.tasks.async_queries import load_explore_json_into_cache
 from superset.typing import FlaskResponse
-from superset.utils import core as utils
+from superset.utils import core as utils, csv
 from superset.utils.async_query_manager import AsyncQueryTokenException
 from superset.utils.cache import etag_cache
 from superset.utils.core import ReservedUrlParameters
@ -437,10 +437,7 @@ class Superset(BaseSupersetView):  # pylint: disable=too-many-public-methods
    ) -> FlaskResponse:
        if response_type == utils.ChartDataResultFormat.CSV:
            return CsvResponse(
-                viz_obj.get_csv(),
-                status=200,
-                headers=generate_download_headers("csv"),
-                mimetype="application/csv",
+                viz_obj.get_csv(), headers=generate_download_headers("csv")
            )

        if response_type == utils.ChartDataResultType.QUERY:
@ -2628,18 +2625,14 @@ class Superset(BaseSupersetView):  # pylint: disable=too-many-public-methods
            columns = [c["name"] for c in obj["columns"]]
            df = pd.DataFrame.from_records(obj["data"], columns=columns)
            logger.info("Using pandas to convert to CSV")
-            csv = df.to_csv(index=False, **config["CSV_EXPORT"])
        else:
            logger.info("Running a query to turn into CSV")
            sql = query.select_sql or query.executed_sql
            df = query.database.get_df(sql, query.schema)
-            # TODO(bkyryliuk): add compression=gzip for big files.
-            csv = df.to_csv(index=False, **config["CSV_EXPORT"])
-        response = Response(csv, mimetype="text/csv")
+        csv_data = csv.df_to_escaped_csv(df, index=False, **config["CSV_EXPORT"])
        quoted_csv_name = parse.quote(query.name)
-        response.headers["Content-Disposition"] = (
-            f'attachment; filename="{quoted_csv_name}.csv"; '
-            f"filename*=UTF-8''{quoted_csv_name}.csv"
+        response = CsvResponse(
+            csv_data, headers=generate_download_headers("csv", quoted_csv_name)
        )
        event_info = {
            "event_type": "data_export",
--- a/superset/viz.py
+++ b/superset/viz.py
@ -66,7 +66,7 @@ from superset.extensions import cache_manager, security_manager
 from superset.models.cache import CacheKey
 from superset.models.helpers import QueryResult
 from superset.typing import QueryObjectDict, VizData, VizPayload
-from superset.utils import core as utils
+from superset.utils import core as utils, csv
 from superset.utils.cache import set_and_log_cache
 from superset.utils.core import (
    DTTM_ALIAS,
@ -615,7 +615,7 @@ class BaseViz:
    def get_csv(self) -> Optional[str]:
        df = self.get_df_payload()["df"]  # leverage caching logic
        include_index = not isinstance(df.index, pd.RangeIndex)
-        return df.to_csv(index=include_index, **config["CSV_EXPORT"])
+        return csv.df_to_escaped_csv(df, index=include_index, **config["CSV_EXPORT"])

    def get_data(self, df: pd.DataFrame) -> VizData:
        return df.to_dict(orient="records")
--- a/tests/utils/csv_tests.py
+++ b/tests/utils/csv_tests.py
@ -0,0 +1,80 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=no-self-use
+import io
+
+import pandas as pd
+import pytest
+
+from superset.utils import csv
+
+
+def test_escape_value():
+    result = csv.escape_value("value")
+    assert result == "value"
+
+    result = csv.escape_value("-10")
+    assert result == "-10"
+
+    result = csv.escape_value("@value")
+    assert result == "'@value"
+
+    result = csv.escape_value("+value")
+    assert result == "'+value"
+
+    result = csv.escape_value("-value")
+    assert result == "'-value"
+
+    result = csv.escape_value("=value")
+    assert result == "'=value"
+
+    result = csv.escape_value("|value")
+    assert result == "'\|value"
+
+    result = csv.escape_value("%value")
+    assert result == "'%value"
+
+    result = csv.escape_value("=cmd|' /C calc'!A0")
+    assert result == "'=cmd\|' /C calc'!A0"
+
+    result = csv.escape_value('""=10+2')
+    assert result == '\'""=10+2'
+
+    result = csv.escape_value(" =10+2")
+    assert result == "' =10+2"
+
+
+def test_df_to_escaped_csv():
+    csv_rows = [
+        ["col_a", "=func()"],
+        ["-10", "=cmd|' /C calc'!A0"],
+        ["a", '""=b'],
+        [" =a", "b"],
+    ]
+    csv_str = "\n".join([",".join(row) for row in csv_rows])
+
+    df = pd.read_csv(io.StringIO(csv_str))
+
+    escaped_csv_str = csv.df_to_escaped_csv(df, encoding="utf8", index=False)
+    escaped_csv_rows = [row.split(",") for row in escaped_csv_str.strip().split("\n")]
+
+    assert escaped_csv_rows == [
+        ["col_a", "'=func()"],
+        ["-10", "'=cmd\|' /C calc'!A0"],
+        ["a", "'=b"],  # pandas seems to be removing the leading ""
+        ["' =a", "b"],
+    ]