feat: linear imputation in Resample (#19393)

This commit is contained in:
Yongjie Zhao 2022-03-28 22:30:45 +08:00 committed by GitHub
parent 6b9113a17b
commit a39dd4493e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 56 additions and 0 deletions

View File

@ -170,6 +170,7 @@ export const advancedAnalyticsControls: ControlPanelSectionConfig = {
choices: [
['asfreq', 'Null imputation'],
['zerofill', 'Zero imputation'],
['linear', 'Linear interpolation'],
['ffill', 'Forward values'],
['bfill', 'Backward values'],
['median', 'Median values'],

View File

@ -20,6 +20,7 @@ import pandas as pd
from flask_babel import gettext as _
from superset.exceptions import InvalidPostProcessingError
from superset.utils.pandas_postprocessing.utils import RESAMPLE_METHOD
def resample(
@ -40,9 +41,15 @@ def resample(
"""
if not isinstance(df.index, pd.DatetimeIndex):
raise InvalidPostProcessingError(_("Resample operation requires DatetimeIndex"))
if method not in RESAMPLE_METHOD:
raise InvalidPostProcessingError(
_("Resample method should in ") + ", ".join(RESAMPLE_METHOD) + "."
)
if method == "asfreq" and fill_value is not None:
_df = df.resample(rule).asfreq(fill_value=fill_value)
elif method == "linear":
_df = df.resample(rule).interpolate()
else:
_df = getattr(df.resample(rule), method)()
return _df

View File

@ -92,6 +92,8 @@ PROPHET_TIME_GRAIN_MAP = {
"P1W/1970-01-04T00:00:00Z": "W",
}
# Method names accepted by the `resample` post-processing operation; any other
# value is rejected there with an InvalidPostProcessingError.
RESAMPLE_METHOD = ("asfreq", "bfill", "ffill", "linear", "median", "mean", "sum")
FLAT_COLUMN_SEPARATOR = ", "

View File

@ -14,8 +14,10 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import numpy as np
import pandas as pd
import pytest
from pandas import to_datetime
from superset.exceptions import InvalidPostProcessingError
from superset.utils import pandas_postprocessing as pp
@ -151,3 +153,47 @@ def test_resample_should_raise_ex():
pp.resample(
df=categories_df, rule="1D", method="asfreq",
)
with pytest.raises(InvalidPostProcessingError):
pp.resample(
df=timeseries_df, rule="1D", method="foobar",
)
def test_resample_linear():
    """Check that ``method="linear"`` upsamples and interpolates numeric data.

    A frame with three irregularly spaced dates is resampled to a daily rule.
    The numeric column ``y`` is expected to be filled by linear interpolation,
    while the string column ``label`` is expected to stay NaN on the rows that
    the resampling introduced.
    """
    df = pd.DataFrame(
        index=to_datetime(["2019-01-01", "2019-01-05", "2019-01-08"]),
        data={"label": ["a", "e", "j"], "y": [1.0, 5.0, 8.0]},
    )
    post_df = pp.resample(df=df, rule="1D", method="linear")

    # Expected result:
    #
    #            label    y
    # 2019-01-01     a  1.0
    # 2019-01-02   NaN  2.0
    # 2019-01-03   NaN  3.0
    # 2019-01-04   NaN  4.0
    # 2019-01-05     e  5.0
    # 2019-01-06   NaN  6.0
    # 2019-01-07   NaN  7.0
    # 2019-01-08     j  8.0
    #
    # `np.nan` is used instead of the `np.NaN` alias, which no longer exists
    # in NumPy 2.0; both names refer to the same float value.
    expected_df = pd.DataFrame(
        index=pd.to_datetime(
            [
                "2019-01-01",
                "2019-01-02",
                "2019-01-03",
                "2019-01-04",
                "2019-01-05",
                "2019-01-06",
                "2019-01-07",
                "2019-01-08",
            ]
        ),
        data={
            "label": ["a", np.nan, np.nan, np.nan, "e", np.nan, np.nan, "j"],
            "y": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
        },
    )
    # DataFrame.equals treats NaNs in the same location as equal.
    assert post_df.equals(expected_df)