feat: support nulls in the csv uploads (#10208)

* Support more table properties for the hive upload Refactor Add tests, and refactor them to be pytest friendly Use lowercase table names Ignore isort * Use sql params Co-authored-by: bogdan kyryliuk <bogdankyryliuk@dropbox.com>
2020-07-06 13:26:43 -07:00 · 2020-07-06 13:26:43 -07:00 · 84f8a51458
parent 318e5347bc
commit 84f8a51458
10 changed files with 325 additions and 160 deletions
--- a/pytest.ini
+++ b/pytest.ini
@ -15,7 +15,6 @@
 # limitations under the License.
 #
 [pytest]
 addopts = -ra -q
 testpaths =
    tests
 python_files = *_test.py test_*.py *_tests.py
--- a/superset/config.py
+++ b/superset/config.py
@ -35,6 +35,7 @@ from celery.schedules import crontab
 from dateutil import tz
 from flask import Blueprint
 from flask_appbuilder.security.manager import AUTH_DB
 from pandas.io.parsers import STR_NA_VALUES
 from superset.jinja_context import (  # pylint: disable=unused-import
    BaseTemplateProcessor,
@ -622,6 +623,9 @@ ALLOWED_USER_CSV_SCHEMA_FUNC: Callable[
    UPLOADED_CSV_HIVE_NAMESPACE
 ] if UPLOADED_CSV_HIVE_NAMESPACE else []
 # Values that should be treated as nulls for the csv uploads.
 CSV_DEFAULT_NA_NAMES = list(STR_NA_VALUES)
 # A dictionary of items that gets merged into the Jinja context for
 # SQL Lab. The existing context gets updated with this dictionary,
 # meaning values for existing keys get overwritten by the content of this
--- a/superset/db_engine_specs/hive.py
+++ b/superset/db_engine_specs/hive.py
@ -106,6 +106,45 @@ class HiveEngineSpec(PrestoEngineSpec):
        except pyhive.exc.ProgrammingError:
            return []
    @classmethod
    def get_create_table_stmt(  # pylint: disable=too-many-arguments
        cls,
        table: Table,
        schema_definition: str,
        location: str,
        delim: str,
        header_line_count: Optional[int],
        null_values: Optional[List[str]],
    ) -> text:
        tblproperties = []
        # available options:
        # https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL
        # TODO(bkyryliuk): figure out what to do with the skip rows field.
        params: Dict[str, str] = {
            "delim": delim,
            "location": location,
        }
        if header_line_count is not None and header_line_count >= 0:
            header_line_count += 1
            tblproperties.append("'skip.header.line.count'=':header_line_count'")
            params["header_line_count"] = str(header_line_count)
        if null_values:
            # hive only supports 1 value for the null format
            tblproperties.append("'serialization.null.format'=':null_value'")
            params["null_value"] = null_values[0]
        if tblproperties:
            tblproperties_stmt = f"tblproperties ({', '.join(tblproperties)})"
            sql = f"""CREATE TABLE {str(table)} ( {schema_definition} )
                ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
                STORED AS TEXTFILE LOCATION :location
                {tblproperties_stmt}"""
        else:
            sql = f"""CREATE TABLE {str(table)} ( {schema_definition} )
                ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
                STORED AS TEXTFILE LOCATION :location"""
        return sql, params
    @classmethod
    def create_table_from_csv(  # pylint: disable=too-many-arguments, too-many-locals
        cls,
@ -182,18 +221,17 @@ class HiveEngineSpec(PrestoEngineSpec):
            bucket_path,
            os.path.join(upload_prefix, table.table, os.path.basename(filename)),
        )
-        sql = text(
+
-            f"""CREATE TABLE {str(table)} ( {schema_definition} )
+        sql, params = cls.get_create_table_stmt(
-            ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
+            table,
-            STORED AS TEXTFILE LOCATION :location
+            schema_definition,
-            tblproperties ('skip.header.line.count'='1')"""
+            location,
            csv_to_df_kwargs["sep"].encode().decode("unicode_escape"),
            int(csv_to_df_kwargs.get("header", 0)),
            csv_to_df_kwargs.get("na_values"),
        )
        engine = cls.get_engine(database)
-        engine.execute(
+        engine.execute(text(sql), **params)
            sql,
            delim=csv_to_df_kwargs["sep"].encode().decode("unicode_escape"),
            location=location,
        )
    @classmethod
    def convert_dttm(cls, target_type: str, dttm: datetime) -> Optional[str]:
--- a/superset/forms.py
+++ b/superset/forms.py
@ -15,12 +15,27 @@
 # specific language governing permissions and limitations
 # under the License.
 """Contains the logic to create cohesive forms on the explore view"""
 import json
 from typing import Any, List, Optional
 from flask_appbuilder.fieldwidgets import BS3TextFieldWidget
 from wtforms import Field
 class JsonListField(Field):
    widget = BS3TextFieldWidget()
    data: List[str] = []
    def _value(self) -> str:
        return json.dumps(self.data)
    def process_formdata(self, valuelist: List[str]) -> None:
        if valuelist and valuelist[0]:
            self.data = json.loads(valuelist[0])
        else:
            self.data = []
 class CommaSeparatedListField(Field):
    widget = BS3TextFieldWidget()
    data: List[str] = []
--- a/superset/views/database/forms.py
+++ b/superset/views/database/forms.py
@ -26,7 +26,11 @@ from wtforms.ext.sqlalchemy.fields import QuerySelectField
 from wtforms.validators import DataRequired, Length, NumberRange, Optional
 from superset import app, db, security_manager
-from superset.forms import CommaSeparatedListField, filter_not_empty_values
+from superset.forms import (
    CommaSeparatedListField,
    filter_not_empty_values,
    JsonListField,
 )
 from superset.models.core import Database
 config = app.config
@ -210,6 +214,16 @@ class CsvToDatabaseForm(DynamicForm):
        validators=[Optional()],
        widget=BS3TextFieldWidget(),
    )
    null_values = JsonListField(
        _("Null values"),
        default=config["CSV_DEFAULT_NA_NAMES"],
        description=_(
            "Json list of the values that should be treated as null. "
            'Examples: [""], ["None", "N/A"], ["nan", "null"]. '
            "Warning: Hive database supports only single value. "
            'Use [""] for empty string.'
        ),
    )
 class ExcelToDatabaseForm(DynamicForm):
@ -376,3 +390,13 @@ class ExcelToDatabaseForm(DynamicForm):
        validators=[Optional()],
        widget=BS3TextFieldWidget(),
    )
    null_values = JsonListField(
        _("Null values"),
        default=config["CSV_DEFAULT_NA_NAMES"],
        description=_(
            "Json list of the values that should be treated as null. "
            'Examples: [""], ["None", "N/A"], ["nan", "null"]. '
            "Warning: Hive database supports only single value. "
            'Use [""] for empty string.'
        ),
    )
--- a/superset/views/database/views.py
+++ b/superset/views/database/views.py
@ -149,6 +149,9 @@ class CsvToDatabaseView(SimpleFormView):
            database = (
                db.session.query(models.Database).filter_by(id=con.data.get("id")).one()
            )
            # More can be found here:
            # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
            csv_to_df_kwargs = {
                "sep": form.sep.data,
                "header": form.header.data if form.header.data else 0,
@ -162,6 +165,12 @@ class CsvToDatabaseView(SimpleFormView):
                "infer_datetime_format": form.infer_datetime_format.data,
                "chunksize": 1000,
            }
            if form.null_values.data:
                csv_to_df_kwargs["na_values"] = form.null_values.data
                csv_to_df_kwargs["keep_default_na"] = False
            # More can be found here:
            # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html
            df_to_sql_kwargs = {
                "name": csv_table.table,
                "if_exists": form.if_exists.data,
--- a/tests/base_tests.py
+++ b/tests/base_tests.py
@ -27,8 +27,8 @@ from flask_appbuilder.security.sqla import models as ab_models
 from flask_testing import TestCase
 from sqlalchemy.orm import Session
 from tests.test_app import app
 from superset.sql_parse import CtasMethod
 from tests.test_app import app  # isort:skip
 from superset import db, security_manager
 from superset.connectors.base.models import BaseDatasource
 from superset.connectors.druid.models import DruidCluster, DruidDatasource
--- a/tests/core_tests.py
+++ b/tests/core_tests.py
@ -916,12 +916,12 @@ class TestCore(SupersetTestCase):
    def test_import_csv(self):
        self.login(username="admin")
-        table_name = "".join(random.choice(string.ascii_uppercase) for _ in range(5))
+        table_name = "".join(random.choice(string.ascii_lowercase) for _ in range(5))
        f1 = "testCSV.csv"
        self.create_sample_csvfile(f1, ["a,b", "john,1", "paul,2"])
        f2 = "testCSV2.csv"
-        self.create_sample_csvfile(f2, ["b,c,d", "john,1,x", "paul,2,y"])
+        self.create_sample_csvfile(f2, ["b,c,d", "john,1,x", "paul,2,"])
        self.enable_csv_upload(utils.get_example_database())
        try:
@ -957,6 +957,23 @@ class TestCore(SupersetTestCase):
            table = self.get_table_by_name(table_name)
            # make sure the new column name is reflected in the table metadata
            self.assertIn("d", table.column_names)
            # null values are set
            self.upload_csv(
                f2,
                table_name,
                extra={"null_values": '["", "john"]', "if_exists": "replace"},
            )
            # make sure that john and empty string are replaced with None
            data = db.session.execute(f"SELECT * from {table_name}").fetchall()
            assert data == [(None, 1, "x"), ("paul", 2, None)]
            # default null values
            self.upload_csv(f2, table_name, extra={"if_exists": "replace"})
            # make sure that john and empty string are replaced with None
            data = db.session.execute(f"SELECT * from {table_name}").fetchall()
            assert data == [("john", 1, "x"), ("paul", 2, None)]
        finally:
            os.remove(f1)
            os.remove(f2)
--- a/tests/db_engine_specs/base_tests.py
+++ b/tests/db_engine_specs/base_tests.py
@ -14,13 +14,13 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 # isort:skip_file
 from datetime import datetime
 from tests.test_app import app
 from tests.base_tests import SupersetTestCase
 from superset.db_engine_specs.mysql import MySQLEngineSpec
 from superset.models.core import Database
 from tests.base_tests import SupersetTestCase
 from tests.test_app import app  # isort:skip
 class TestDbEngineSpec(SupersetTestCase):
--- a/tests/db_engine_specs/hive_tests.py
+++ b/tests/db_engine_specs/hive_tests.py
@ -14,42 +14,48 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 # isort:skip_file
 from datetime import datetime
 from unittest import mock
 import pytest
 from tests.test_app import app
 from superset.db_engine_specs.hive import HiveEngineSpec
 from superset.exceptions import SupersetException
 from superset.sql_parse import Table
 from tests.db_engine_specs.base_tests import TestDbEngineSpec
-class TestHiveDbEngineSpec(TestDbEngineSpec):
+def test_0_progress():
    def test_0_progress(self):
    log = """
        17/02/07 18:26:27 INFO log.PerfLogger: <PERFLOG method=compile from=org.apache.hadoop.hive.ql.Driver>
        17/02/07 18:26:27 INFO log.PerfLogger: <PERFLOG method=parse from=org.apache.hadoop.hive.ql.Driver>
    """.split(
        "\n"
    )
-        self.assertEqual(0, HiveEngineSpec.progress(log))
+    assert HiveEngineSpec.progress(log) == 0
-    def test_number_of_jobs_progress(self):
+
 def test_number_of_jobs_progress():
    log = """
        17/02/07 19:15:55 INFO ql.Driver: Total jobs = 2
    """.split(
        "\n"
    )
-        self.assertEqual(0, HiveEngineSpec.progress(log))
+    assert HiveEngineSpec.progress(log) == 0
-    def test_job_1_launched_progress(self):
+
 def test_job_1_launched_progress():
    log = """
        17/02/07 19:15:55 INFO ql.Driver: Total jobs = 2
        17/02/07 19:15:55 INFO ql.Driver: Launching Job 1 out of 2
    """.split(
        "\n"
    )
-        self.assertEqual(0, HiveEngineSpec.progress(log))
+    assert HiveEngineSpec.progress(log) == 0
-    def test_job_1_launched_stage_1(self):
+
 def test_job_1_launched_stage_1():
    log = """
        17/02/07 19:15:55 INFO ql.Driver: Total jobs = 2
        17/02/07 19:15:55 INFO ql.Driver: Launching Job 1 out of 2
@ -57,11 +63,10 @@ class TestHiveDbEngineSpec(TestDbEngineSpec):
    """.split(
        "\n"
    )
-        self.assertEqual(0, HiveEngineSpec.progress(log))
+    assert HiveEngineSpec.progress(log) == 0
-    def test_job_1_launched_stage_1_map_40_progress(
+
-        self,
+def test_job_1_launched_stage_1_map_40_progress():  # pylint: disable=invalid-name
    ):  # pylint: disable=invalid-name
    log = """
        17/02/07 19:15:55 INFO ql.Driver: Total jobs = 2
        17/02/07 19:15:55 INFO ql.Driver: Launching Job 1 out of 2
@ -70,11 +75,10 @@ class TestHiveDbEngineSpec(TestDbEngineSpec):
    """.split(
        "\n"
    )
-        self.assertEqual(10, HiveEngineSpec.progress(log))
+    assert HiveEngineSpec.progress(log) == 10
-    def test_job_1_launched_stage_1_map_80_reduce_40_progress(
+
-        self,
+def test_job_1_launched_stage_1_map_80_reduce_40_progress():  # pylint: disable=invalid-name
    ):  # pylint: disable=invalid-name
    log = """
        17/02/07 19:15:55 INFO ql.Driver: Total jobs = 2
        17/02/07 19:15:55 INFO ql.Driver: Launching Job 1 out of 2
@ -84,11 +88,10 @@ class TestHiveDbEngineSpec(TestDbEngineSpec):
    """.split(
        "\n"
    )
-        self.assertEqual(30, HiveEngineSpec.progress(log))
+    assert HiveEngineSpec.progress(log) == 30
-    def test_job_1_launched_stage_2_stages_progress(
+
-        self,
+def test_job_1_launched_stage_2_stages_progress():  # pylint: disable=invalid-name
    ):  # pylint: disable=invalid-name
    log = """
        17/02/07 19:15:55 INFO ql.Driver: Total jobs = 2
        17/02/07 19:15:55 INFO ql.Driver: Launching Job 1 out of 2
@ -100,11 +103,10 @@ class TestHiveDbEngineSpec(TestDbEngineSpec):
    """.split(
        "\n"
    )
-        self.assertEqual(12, HiveEngineSpec.progress(log))
+    assert HiveEngineSpec.progress(log) == 12
-    def test_job_2_launched_stage_2_stages_progress(
+
-        self,
+def test_job_2_launched_stage_2_stages_progress():  # pylint: disable=invalid-name
    ):  # pylint: disable=invalid-name
    log = """
        17/02/07 19:15:55 INFO ql.Driver: Total jobs = 2
        17/02/07 19:15:55 INFO ql.Driver: Launching Job 1 out of 2
@ -115,63 +117,120 @@ class TestHiveDbEngineSpec(TestDbEngineSpec):
    """.split(
        "\n"
    )
-        self.assertEqual(60, HiveEngineSpec.progress(log))
+    assert HiveEngineSpec.progress(log) == 60
-    def test_hive_error_msg(self):
+
 def test_hive_error_msg():
    msg = (
        '{...} errorMessage="Error while compiling statement: FAILED: '
        "SemanticException [Error 10001]: Line 4"
        ":5 Table not found 'fact_ridesfdslakj'\", statusCode=3, "
        "sqlState='42S02', errorCode=10001)){...}"
    )
-        self.assertEqual(
+    assert HiveEngineSpec.extract_error_message(Exception(msg)) == (
            (
        "hive error: Error while compiling statement: FAILED: "
        "SemanticException [Error 10001]: Line 4:5 "
        "Table not found 'fact_ridesfdslakj'"
            ),
            HiveEngineSpec.extract_error_message(Exception(msg)),
    )
    e = Exception("Some string that doesn't match the regex")
-        self.assertEqual(f"hive error: {e}", HiveEngineSpec.extract_error_message(e))
+    assert HiveEngineSpec.extract_error_message(e) == f"hive error: {e}"
    msg = (
        "errorCode=10001, "
        'errorMessage="Error while compiling statement"), operationHandle'
        '=None)"'
    )
-        self.assertEqual(
+    assert (
-            ("hive error: Error while compiling statement"),
+        HiveEngineSpec.extract_error_message(Exception(msg))
-            HiveEngineSpec.extract_error_message(Exception(msg)),
+        == "hive error: Error while compiling statement"
    )
-    def test_hive_get_view_names_return_empty_list(
+
-        self,
+def test_hive_get_view_names_return_empty_list():  # pylint: disable=invalid-name
-    ):  # pylint: disable=invalid-name
+    assert HiveEngineSpec.get_view_names(mock.ANY, mock.ANY, mock.ANY) == []
-        self.assertEqual(
+
-            [], HiveEngineSpec.get_view_names(mock.ANY, mock.ANY, mock.ANY)
+
 def test_convert_dttm():
    dttm = datetime.strptime("2019-01-02 03:04:05.678900", "%Y-%m-%d %H:%M:%S.%f")
    assert HiveEngineSpec.convert_dttm("DATE", dttm) == "CAST('2019-01-02' AS DATE)"
    assert (
        HiveEngineSpec.convert_dttm("TIMESTAMP", dttm)
        == "CAST('2019-01-02 03:04:05.678900' AS TIMESTAMP)"
    )
    def test_convert_dttm(self):
        dttm = self.get_dttm()
-        self.assertEqual(
+def test_create_table_from_csv_append() -> None:
-            HiveEngineSpec.convert_dttm("DATE", dttm), "CAST('2019-01-02' AS DATE)"
+
    with pytest.raises(SupersetException):
        HiveEngineSpec.create_table_from_csv(
            "foo.csv", Table("foobar"), mock.MagicMock(), {}, {"if_exists": "append"}
        )
        self.assertEqual(
            HiveEngineSpec.convert_dttm("TIMESTAMP", dttm),
            "CAST('2019-01-02 03:04:05.678900' AS TIMESTAMP)",
        )
-    def test_create_table_from_csv_append(self) -> None:
+def test_get_create_table_stmt() -> None:
-        self.assertRaises(
+    table = Table("employee")
-            SupersetException,
+    schema_def = """eid int, name String, salary String, destination String"""
-            HiveEngineSpec.create_table_from_csv,
+    location = "s3a://directory/table"
-            "foo.csv",
+    from unittest import TestCase
-            Table("foobar"),
+
-            None,
+    TestCase.maxDiff = None
-            {},
+    assert HiveEngineSpec.get_create_table_stmt(
-            {"if_exists": "append"},
+        table, schema_def, location, ",", 0, [""]
    ) == (
        """CREATE TABLE employee ( eid int, name String, salary String, destination String )
                ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
                STORED AS TEXTFILE LOCATION :location
                tblproperties ('skip.header.line.count'=':header_line_count', 'serialization.null.format'=':null_value')""",
        {
            "delim": ",",
            "location": "s3a://directory/table",
            "header_line_count": "1",
            "null_value": "",
        },
    )
    assert HiveEngineSpec.get_create_table_stmt(
        table, schema_def, location, ",", 1, ["1", "2"]
    ) == (
        """CREATE TABLE employee ( eid int, name String, salary String, destination String )
                ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
                STORED AS TEXTFILE LOCATION :location
                tblproperties ('skip.header.line.count'=':header_line_count', 'serialization.null.format'=':null_value')""",
        {
            "delim": ",",
            "location": "s3a://directory/table",
            "header_line_count": "2",
            "null_value": "1",
        },
    )
    assert HiveEngineSpec.get_create_table_stmt(
        table, schema_def, location, ",", 100, ["NaN"]
    ) == (
        """CREATE TABLE employee ( eid int, name String, salary String, destination String )
                ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
                STORED AS TEXTFILE LOCATION :location
                tblproperties ('skip.header.line.count'=':header_line_count', 'serialization.null.format'=':null_value')""",
        {
            "delim": ",",
            "location": "s3a://directory/table",
            "header_line_count": "101",
            "null_value": "NaN",
        },
    )
    assert HiveEngineSpec.get_create_table_stmt(
        table, schema_def, location, ",", None, None
    ) == (
        """CREATE TABLE employee ( eid int, name String, salary String, destination String )
                ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
                STORED AS TEXTFILE LOCATION :location""",
        {"delim": ",", "location": "s3a://directory/table"},
    )
    assert HiveEngineSpec.get_create_table_stmt(
        table, schema_def, location, ",", 100, []
    ) == (
        """CREATE TABLE employee ( eid int, name String, salary String, destination String )
                ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
                STORED AS TEXTFILE LOCATION :location
                tblproperties ('skip.header.line.count'=':header_line_count')""",
        {"delim": ",", "location": "s3a://directory/table", "header_line_count": "101"},
    )