fix: Histogram chart not able to use decimal datatype column (#30416)

2024-09-30 09:04:55 -03:00 · 2024-09-30 09:04:55 -03:00 · 4834390e6a
parent bdd50c7553
commit 4834390e6a
3 changed files with 19 additions and 24 deletions
--- a/superset-frontend/plugins/plugin-chart-echarts/src/Histogram/buildQuery.ts
+++ b/superset-frontend/plugins/plugin-chart-echarts/src/Histogram/buildQuery.ts
@@ -25,7 +25,6 @@ export default function buildQuery(formData: HistogramFormData) {
  return buildQueryContext(formData, baseQueryObject => [
    {
      ...baseQueryObject,
-      extras: { where: `${column} IS NOT NULL` },
      columns: [...groupby, column],
      post_processing: [histogramOperator(formData, baseQueryObject)],
      metrics: undefined,
--- a/superset/utils/pandas_postprocessing/histogram.py
+++ b/superset/utils/pandas_postprocessing/histogram.py
@@ -17,7 +17,7 @@
 from __future__ import annotations

 import numpy as np
-from pandas import DataFrame, Series
+from pandas import DataFrame, Series, to_numeric


 # pylint: disable=too-many-arguments
@@ -48,12 +48,15 @@ def histogram(
    if groupby is None:
        groupby = []

-    # check if the column is numeric
-    if not np.issubdtype(df[column].dtype, np.number):
-        raise ValueError(f"The column '{column}' must be numeric.")
+    # convert to numeric, coercing errors to NaN
+    df[column] = to_numeric(df[column], errors="coerce")
+
+    # check if the column contains non-numeric values
+    if df[column].isna().any():
+        raise ValueError(f"Column '{column}' contains non-numeric values")

    # calculate the histogram bin edges
-    bin_edges = np.histogram_bin_edges(df[column].dropna(), bins=bins)
+    bin_edges = np.histogram_bin_edges(df[column], bins=bins)

    # convert the bin edges to strings
    bin_edges_str = [
@@ -62,6 +65,7 @@ def histogram(
    ]

    def hist_values(series: Series) -> np.ndarray:
+        # we might have NaN values as the result of grouping so we need to drop them
        result = np.histogram(series.dropna(), bins=bin_edges)[0]
        return result if not cumulative else np.cumsum(result)

--- a/tests/unit_tests/pandas_postprocessing/test_histogram.py
+++ b/tests/unit_tests/pandas_postprocessing/test_histogram.py
@@ -117,28 +117,20 @@ def test_histogram_with_groupby_and_cumulative_and_normalize():

 def test_histogram_with_non_numeric_column():
    try:
-        histogram(data, "b", ["group"], bins)
+        histogram(data, "group", None, bins)
    except ValueError as e:
-        assert str(e) == "The column 'b' must be numeric."
+        assert str(e) == "Column 'group' contains non-numeric values"


-# test histogram ignore null values
-def test_histogram_ignore_null_values():
-    data_with_null = DataFrame(
+def test_histogram_with_some_non_numeric_values():
+    data_with_non_numeric = DataFrame(
        {
            "group": ["A", "A", "B", "B", "A", "A", "B", "B", "A", "A"],
-            "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, None],
-            "b": [1, 2, 3, 4, 5, 6, 7, 8, 9, None],
+            "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, "10"],
+            "b": [1, 2, 3, 4, 5, 6, 7, 8, 9, "10"],
        }
    )
-    result = histogram(data_with_null, "a", ["group"], bins)
-    assert result.shape == (2, bins + 1)
-    assert result.columns.tolist() == [
-        "group",
-        "1 - 2",
-        "2 - 4",
-        "4 - 5",
-        "5 - 7",
-        "7 - 9",
-    ]
-    assert result.values.tolist() == [["A", 2, 0, 1, 1, 1], ["B", 0, 2, 0, 1, 1]]
+    try:
+        histogram(data_with_non_numeric, "a", ["group"], bins)
+    except ValueError as e:
+        assert str(e) == "Column 'group' contains non-numeric values"