| | | |
|---|---|---|
| author | sotech117 <michael_foiani@brown.edu> | 2025-07-31 17:27:24 -0400 |
| committer | sotech117 <michael_foiani@brown.edu> | 2025-07-31 17:27:24 -0400 |
| commit | 5bf22fc7e3c392c8bd44315ca2d06d7dca7d084e (patch) | |
| tree | 8dacb0f195df1c0788d36dd0064f6bbaa3143ede /venv/lib/python3.8/site-packages/narwhals/_pandas_like | |
| parent | b832d364da8c2efe09e3f75828caf73c50d01ce3 (diff) | |
add code for analysis of data
Diffstat (limited to 'venv/lib/python3.8/site-packages/narwhals/_pandas_like')
14 files changed, 4388 insertions(+), 0 deletions(-)
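The diff below vendors narwhals' `_pandas_like` backend into the project's virtualenv. For orientation: these modules are the compliant layer that narwhals' public API dispatches to when the native object is a pandas-like DataFrame. A minimal sketch of that entry point, assuming a standard narwhals install alongside pandas (the sample frame and column names are illustrative, not part of the diff):

```python
import narwhals as nw
import pandas as pd

# A native pandas frame; nw.from_native wraps it in the pandas-like
# compliant layer implemented by the vendored modules in this diff.
native = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
df = nw.from_native(native, eager_only=True)

# Expressions are evaluated by the backend's PandasLikeExpr against a
# PandasLikeDataFrame, then unwrapped back to the native pandas type.
result = df.with_columns((nw.col("a") + nw.col("b")).alias("a_plus_b")).to_native()
print(result)
```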
diff --git a/venv/lib/python3.8/site-packages/narwhals/_pandas_like/__init__.py b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/__init__.py diff --git a/venv/lib/python3.8/site-packages/narwhals/_pandas_like/dataframe.py b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/dataframe.py new file mode 100644 index 0000000..bf5287f --- /dev/null +++ b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/dataframe.py @@ -0,0 +1,1148 @@ +from __future__ import annotations + +from itertools import chain, product +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Iterable, + Iterator, + Literal, + Mapping, + Sequence, + cast, + overload, +) + +import numpy as np + +from narwhals._compliant import EagerDataFrame +from narwhals._pandas_like.series import PANDAS_TO_NUMPY_DTYPE_MISSING, PandasLikeSeries +from narwhals._pandas_like.utils import ( + align_and_extract_native, + align_series_full_broadcast, + check_column_names_are_unique, + get_dtype_backend, + native_to_narwhals_dtype, + object_native_to_narwhals_dtype, + rename, + select_columns_by_name, + set_index, +) +from narwhals._utils import ( + Implementation, + _into_arrow_table, + _remap_full_join_keys, + exclude_column_names, + generate_temporary_column_name, + parse_columns_to_drop, + parse_version, + scale_bytes, + validate_backend_version, +) +from narwhals.dependencies import is_pandas_like_dataframe +from narwhals.exceptions import InvalidOperationError, ShapeError + +if TYPE_CHECKING: + from io import BytesIO + from pathlib import Path + from types import ModuleType + + import pandas as pd + import polars as pl + from typing_extensions import Self, TypeAlias, TypeIs + + from narwhals._compliant.typing import CompliantDataFrameAny, CompliantLazyFrameAny + from narwhals._pandas_like.expr import PandasLikeExpr + from narwhals._pandas_like.group_by import PandasLikeGroupBy + from narwhals._pandas_like.namespace import PandasLikeNamespace + from narwhals._translate import IntoArrowTable + from narwhals._utils import Version, _FullContext + from narwhals.dtypes import DType + from narwhals.schema import Schema + from narwhals.typing import ( + AsofJoinStrategy, + DTypeBackend, + JoinStrategy, + PivotAgg, + SizedMultiIndexSelector, + SizedMultiNameSelector, + SizeUnit, + UniqueKeepStrategy, + _2DArray, + _SliceIndex, + _SliceName, + ) + + Constructor: TypeAlias = Callable[..., pd.DataFrame] + + +CLASSICAL_NUMPY_DTYPES: frozenset[np.dtype[Any]] = frozenset( + [ + np.dtype("float64"), + np.dtype("float32"), + np.dtype("int64"), + np.dtype("int32"), + np.dtype("int16"), + np.dtype("int8"), + np.dtype("uint64"), + np.dtype("uint32"), + np.dtype("uint16"), + np.dtype("uint8"), + np.dtype("bool"), + np.dtype("datetime64[s]"), + np.dtype("datetime64[ms]"), + np.dtype("datetime64[us]"), + np.dtype("datetime64[ns]"), + np.dtype("timedelta64[s]"), + np.dtype("timedelta64[ms]"), + np.dtype("timedelta64[us]"), + np.dtype("timedelta64[ns]"), + np.dtype("object"), + ] +) + + +class PandasLikeDataFrame(EagerDataFrame["PandasLikeSeries", "PandasLikeExpr", "Any"]): + def __init__( + self, + native_dataframe: Any, + *, + implementation: Implementation, + backend_version: tuple[int, ...], + version: Version, + validate_column_names: bool, + ) -> None: + self._native_frame = native_dataframe + self._implementation = implementation + self._backend_version = backend_version + self._version = version + 
validate_backend_version(self._implementation, self._backend_version) + if validate_column_names: + check_column_names_are_unique(native_dataframe.columns) + + @classmethod + def from_arrow(cls, data: IntoArrowTable, /, *, context: _FullContext) -> Self: + implementation = context._implementation + tbl = _into_arrow_table(data, context) + if implementation.is_pandas(): + native = tbl.to_pandas() + elif implementation.is_modin(): # pragma: no cover + from modin.pandas.utils import ( + from_arrow as mpd_from_arrow, # pyright: ignore[reportAttributeAccessIssue] + ) + + native = mpd_from_arrow(tbl) + elif implementation.is_cudf(): # pragma: no cover + native = implementation.to_native_namespace().DataFrame.from_arrow(tbl) + else: # pragma: no cover + msg = "congratulations, you entered unreachable code - please report a bug" + raise AssertionError(msg) + return cls.from_native(native, context=context) + + @classmethod + def from_dict( + cls, + data: Mapping[str, Any], + /, + *, + context: _FullContext, + schema: Mapping[str, DType] | Schema | None, + ) -> Self: + from narwhals.schema import Schema + + implementation = context._implementation + ns = implementation.to_native_namespace() + Series = cast("type[pd.Series[Any]]", ns.Series) # noqa: N806 + DataFrame = cast("type[pd.DataFrame]", ns.DataFrame) # noqa: N806 + aligned_data: dict[str, pd.Series[Any] | Any] = {} + left_most: PandasLikeSeries | None = None + for name, series in data.items(): + if isinstance(series, Series): + compliant = PandasLikeSeries.from_native(series, context=context) + if left_most is None: + left_most = compliant + aligned_data[name] = series + else: + aligned_data[name] = align_and_extract_native(left_most, compliant)[1] + else: + aligned_data[name] = series + + native = DataFrame.from_dict(aligned_data) + if schema: + it: Iterable[DTypeBackend] = ( + get_dtype_backend(dtype, implementation) for dtype in native.dtypes + ) + native = native.astype(Schema(schema).to_pandas(it)) + return cls.from_native(native, context=context) + + @staticmethod + def _is_native(obj: Any) -> TypeIs[Any]: + return is_pandas_like_dataframe(obj) # pragma: no cover + + @classmethod + def from_native(cls, data: Any, /, *, context: _FullContext) -> Self: + return cls( + data, + implementation=context._implementation, + backend_version=context._backend_version, + version=context._version, + validate_column_names=True, + ) + + @classmethod + def from_numpy( + cls, + data: _2DArray, + /, + *, + context: _FullContext, + schema: Mapping[str, DType] | Schema | Sequence[str] | None, + ) -> Self: + from narwhals.schema import Schema + + implementation = context._implementation + DataFrame: Constructor = implementation.to_native_namespace().DataFrame # noqa: N806 + if isinstance(schema, (Mapping, Schema)): + it: Iterable[DTypeBackend] = ( + get_dtype_backend(native_type, implementation) + for native_type in schema.values() + ) + native = DataFrame(data, columns=schema.keys()).astype( + Schema(schema).to_pandas(it) + ) + else: + native = DataFrame(data, columns=cls._numpy_column_names(data, schema)) + return cls.from_native(native, context=context) + + def __narwhals_dataframe__(self) -> Self: + return self + + def __narwhals_lazyframe__(self) -> Self: + return self + + def __narwhals_namespace__(self) -> PandasLikeNamespace: + from narwhals._pandas_like.namespace import PandasLikeNamespace + + return PandasLikeNamespace( + self._implementation, self._backend_version, version=self._version + ) + + def __native_namespace__(self) -> ModuleType: + if 
self._implementation in { + Implementation.PANDAS, + Implementation.MODIN, + Implementation.CUDF, + }: + return self._implementation.to_native_namespace() + + msg = f"Expected pandas/modin/cudf, got: {type(self._implementation)}" # pragma: no cover + raise AssertionError(msg) + + def __len__(self) -> int: + return len(self.native) + + def _with_version(self, version: Version) -> Self: + return self.__class__( + self.native, + implementation=self._implementation, + backend_version=self._backend_version, + version=version, + validate_column_names=False, + ) + + def _with_native(self, df: Any, *, validate_column_names: bool = True) -> Self: + return self.__class__( + df, + implementation=self._implementation, + backend_version=self._backend_version, + version=self._version, + validate_column_names=validate_column_names, + ) + + def _extract_comparand(self, other: PandasLikeSeries) -> pd.Series[Any]: + index = self.native.index + if other._broadcast: + s = other.native + return type(s)(s.iloc[0], index=index, dtype=s.dtype, name=s.name) + if (len_other := len(other)) != (len_idx := len(index)): + msg = f"Expected object of length {len_idx}, got: {len_other}." + raise ShapeError(msg) + if other.native.index is not index: + return set_index( + other.native, + index, + implementation=other._implementation, + backend_version=other._backend_version, + ) + return other.native + + def get_column(self, name: str) -> PandasLikeSeries: + return PandasLikeSeries.from_native(self.native[name], context=self) + + def __array__(self, dtype: Any = None, *, copy: bool | None = None) -> _2DArray: + return self.to_numpy(dtype=dtype, copy=copy) + + def _gather(self, rows: SizedMultiIndexSelector[pd.Series[Any]]) -> Self: + items = list(rows) if isinstance(rows, tuple) else rows + return self._with_native(self.native.iloc[items, :]) + + def _gather_slice(self, rows: _SliceIndex | range) -> Self: + return self._with_native( + self.native.iloc[slice(rows.start, rows.stop, rows.step), :], + validate_column_names=False, + ) + + def _select_slice_name(self, columns: _SliceName) -> Self: + start = ( + self.native.columns.get_loc(columns.start) + if columns.start is not None + else None + ) + stop = ( + self.native.columns.get_loc(columns.stop) + 1 + if columns.stop is not None + else None + ) + selector = slice(start, stop, columns.step) + return self._with_native( + self.native.iloc[:, selector], validate_column_names=False + ) + + def _select_slice_index(self, columns: _SliceIndex | range) -> Self: + return self._with_native( + self.native.iloc[:, columns], validate_column_names=False + ) + + def _select_multi_index( + self, columns: SizedMultiIndexSelector[pd.Series[Any]] + ) -> Self: + columns = list(columns) if isinstance(columns, tuple) else columns + return self._with_native( + self.native.iloc[:, columns], validate_column_names=False + ) + + def _select_multi_name( + self, columns: SizedMultiNameSelector[pd.Series[Any]] + ) -> PandasLikeDataFrame: + return self._with_native(self.native.loc[:, columns]) + + # --- properties --- + @property + def columns(self) -> list[str]: + return self.native.columns.tolist() + + @overload + def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... + + @overload + def rows(self, *, named: Literal[False]) -> list[tuple[Any, ...]]: ... + + @overload + def rows(self, *, named: bool) -> list[tuple[Any, ...]] | list[dict[str, Any]]: ... + + def rows(self, *, named: bool) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + if not named: + # cuDF does not support itertuples. 
But it does support to_dict! + if self._implementation is Implementation.CUDF: + # Extract the row values from the named rows + return [tuple(row.values()) for row in self.rows(named=True)] + + return list(self.native.itertuples(index=False, name=None)) + + return self.native.to_dict(orient="records") + + def iter_columns(self) -> Iterator[PandasLikeSeries]: + for _name, series in self.native.items(): # noqa: PERF102 + yield PandasLikeSeries.from_native(series, context=self) + + _iter_columns = iter_columns + + def iter_rows( + self, *, named: bool, buffer_size: int + ) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + # The param ``buffer_size`` is only here for compatibility with the Polars API + # and has no effect on the output. + if not named: + yield from self.native.itertuples(index=False, name=None) + else: + col_names = self.native.columns + for row in self.native.itertuples(index=False): + yield dict(zip(col_names, row)) + + @property + def schema(self) -> dict[str, DType]: + native_dtypes = self.native.dtypes + return { + col: native_to_narwhals_dtype( + native_dtypes[col], self._version, self._implementation + ) + if native_dtypes[col] != "object" + else object_native_to_narwhals_dtype( + self.native[col], self._version, self._implementation + ) + for col in self.native.columns + } + + def collect_schema(self) -> dict[str, DType]: + return self.schema + + # --- reshape --- + def simple_select(self, *column_names: str) -> Self: + return self._with_native( + select_columns_by_name( + self.native, + list(column_names), + self._backend_version, + self._implementation, + ), + validate_column_names=False, + ) + + def select(self: PandasLikeDataFrame, *exprs: PandasLikeExpr) -> PandasLikeDataFrame: + new_series = self._evaluate_into_exprs(*exprs) + if not new_series: + # return empty dataframe, like Polars does + return self._with_native(self.native.__class__(), validate_column_names=False) + new_series = align_series_full_broadcast(*new_series) + namespace = self.__narwhals_namespace__() + df = namespace._concat_horizontal([s.native for s in new_series]) + # `concat` creates a new object, so fine to modify `.columns.name` inplace. 
+ df.columns.name = self.native.columns.name + return self._with_native(df, validate_column_names=True) + + def drop_nulls( + self: PandasLikeDataFrame, subset: Sequence[str] | None + ) -> PandasLikeDataFrame: + if subset is None: + return self._with_native( + self.native.dropna(axis=0), validate_column_names=False + ) + plx = self.__narwhals_namespace__() + return self.filter(~plx.any_horizontal(plx.col(*subset).is_null())) + + def estimated_size(self, unit: SizeUnit) -> int | float: + sz = self.native.memory_usage(deep=True).sum() + return scale_bytes(sz, unit=unit) + + def with_row_index(self, name: str) -> Self: + frame = self.native + namespace = self.__narwhals_namespace__() + row_index = namespace._series.from_iterable( + range(len(frame)), context=self, index=frame.index + ).alias(name) + return self._with_native(namespace._concat_horizontal([row_index.native, frame])) + + def row(self, index: int) -> tuple[Any, ...]: + return tuple(x for x in self.native.iloc[index]) + + def filter( + self: PandasLikeDataFrame, predicate: PandasLikeExpr | list[bool] + ) -> PandasLikeDataFrame: + if isinstance(predicate, list): + mask_native: pd.Series[Any] | list[bool] = predicate + else: + # `[0]` is safe as the predicate's expression only returns a single column + mask = self._evaluate_into_exprs(predicate)[0] + mask_native = self._extract_comparand(mask) + return self._with_native( + self.native.loc[mask_native], validate_column_names=False + ) + + def with_columns( + self: PandasLikeDataFrame, *exprs: PandasLikeExpr + ) -> PandasLikeDataFrame: + columns = self._evaluate_into_exprs(*exprs) + if not columns and len(self) == 0: + return self + name_columns: dict[str, PandasLikeSeries] = {s.name: s for s in columns} + to_concat = [] + # Make sure to preserve column order + for name in self.native.columns: + if name in name_columns: + series = self._extract_comparand(name_columns.pop(name)) + else: + series = self.native[name] + to_concat.append(series) + to_concat.extend(self._extract_comparand(s) for s in name_columns.values()) + namespace = self.__narwhals_namespace__() + df = namespace._concat_horizontal(to_concat) + # `concat` creates a new object, so fine to modify `.columns.name` inplace. 
+ df.columns.name = self.native.columns.name + return self._with_native(df, validate_column_names=False) + + def rename(self, mapping: Mapping[str, str]) -> Self: + return self._with_native( + rename( + self.native, + columns=mapping, + implementation=self._implementation, + backend_version=self._backend_version, + ) + ) + + def drop(self, columns: Sequence[str], *, strict: bool) -> Self: + to_drop = parse_columns_to_drop(self, columns, strict=strict) + return self._with_native( + self.native.drop(columns=to_drop), validate_column_names=False + ) + + # --- transform --- + def sort(self, *by: str, descending: bool | Sequence[bool], nulls_last: bool) -> Self: + df = self.native + if isinstance(descending, bool): + ascending: bool | list[bool] = not descending + else: + ascending = [not d for d in descending] + na_position = "last" if nulls_last else "first" + return self._with_native( + df.sort_values(list(by), ascending=ascending, na_position=na_position), + validate_column_names=False, + ) + + # --- convert --- + def collect( + self, backend: Implementation | None, **kwargs: Any + ) -> CompliantDataFrameAny: + if backend is None: + return PandasLikeDataFrame( + self.native, + implementation=self._implementation, + backend_version=self._backend_version, + version=self._version, + validate_column_names=False, + ) + + if backend is Implementation.PANDAS: + import pandas as pd # ignore-banned-import + + return PandasLikeDataFrame( + self.to_pandas(), + implementation=Implementation.PANDAS, + backend_version=parse_version(pd), + version=self._version, + validate_column_names=False, + ) + + if backend is Implementation.PYARROW: + import pyarrow as pa # ignore-banned-import + + from narwhals._arrow.dataframe import ArrowDataFrame + + return ArrowDataFrame( + native_dataframe=self.to_arrow(), + backend_version=parse_version(pa), + version=self._version, + validate_column_names=False, + ) + + if backend is Implementation.POLARS: + import polars as pl # ignore-banned-import + + from narwhals._polars.dataframe import PolarsDataFrame + + return PolarsDataFrame( + df=self.to_polars(), + backend_version=parse_version(pl), + version=self._version, + ) + + msg = f"Unsupported `backend` value: {backend}" # pragma: no cover + raise ValueError(msg) # pragma: no cover + + # --- actions --- + def group_by( + self, keys: Sequence[str] | Sequence[PandasLikeExpr], *, drop_null_keys: bool + ) -> PandasLikeGroupBy: + from narwhals._pandas_like.group_by import PandasLikeGroupBy + + return PandasLikeGroupBy(self, keys, drop_null_keys=drop_null_keys) + + def join( # noqa: C901, PLR0911, PLR0912 + self, + other: Self, + *, + how: JoinStrategy, + left_on: Sequence[str] | None, + right_on: Sequence[str] | None, + suffix: str, + ) -> Self: + if how == "cross": + if ( + self._implementation is Implementation.MODIN + or self._implementation is Implementation.CUDF + ) or ( + self._implementation is Implementation.PANDAS + and self._backend_version < (1, 4) + ): + key_token = generate_temporary_column_name( + n_bytes=8, columns=[*self.columns, *other.columns] + ) + + return self._with_native( + self.native.assign(**{key_token: 0}) + .merge( + other.native.assign(**{key_token: 0}), + how="inner", + left_on=key_token, + right_on=key_token, + suffixes=("", suffix), + ) + .drop(columns=key_token) + ) + else: + return self._with_native( + self.native.merge(other.native, how="cross", suffixes=("", suffix)) + ) + + if how == "anti": + if self._implementation is Implementation.CUDF: + return self._with_native( + self.native.merge( + 
other.native, how="leftanti", left_on=left_on, right_on=right_on + ) + ) + else: + indicator_token = generate_temporary_column_name( + n_bytes=8, columns=[*self.columns, *other.columns] + ) + if right_on is None: # pragma: no cover + msg = "`right_on` cannot be `None` in anti-join" + raise TypeError(msg) + + # rename to avoid creating extra columns in join + other_native = rename( + select_columns_by_name( + other.native, + list(right_on), + self._backend_version, + self._implementation, + ), + columns=dict(zip(right_on, left_on)), # type: ignore[arg-type] + implementation=self._implementation, + backend_version=self._backend_version, + ).drop_duplicates() + return self._with_native( + self.native.merge( + other_native, + how="outer", + indicator=indicator_token, + left_on=left_on, + right_on=left_on, + ) + .loc[lambda t: t[indicator_token] == "left_only"] + .drop(columns=indicator_token) + ) + + if how == "semi": + if right_on is None: # pragma: no cover + msg = "`right_on` cannot be `None` in semi-join" + raise TypeError(msg) + # rename to avoid creating extra columns in join + other_native = ( + rename( + select_columns_by_name( + other.native, + list(right_on), + self._backend_version, + self._implementation, + ), + columns=dict(zip(right_on, left_on)), # type: ignore[arg-type] + implementation=self._implementation, + backend_version=self._backend_version, + ).drop_duplicates() # avoids potential rows duplication from inner join + ) + return self._with_native( + self.native.merge( + other_native, how="inner", left_on=left_on, right_on=left_on + ) + ) + + if how == "left": + result_native = self.native.merge( + other.native, + how="left", + left_on=left_on, + right_on=right_on, + suffixes=("", suffix), + ) + extra = [] + for left_key, right_key in zip(left_on, right_on): # type: ignore[arg-type] + if right_key != left_key and right_key not in self.columns: + extra.append(right_key) + elif right_key != left_key: + extra.append(f"{right_key}{suffix}") + return self._with_native(result_native.drop(columns=extra)) + + if how == "full": + # Pandas coalesces keys in full joins unless there's no collision + + # help mypy + assert left_on is not None # noqa: S101 + assert right_on is not None # noqa: S101 + + right_on_mapper = _remap_full_join_keys(left_on, right_on, suffix) + other_native = other.native.rename(columns=right_on_mapper) + check_column_names_are_unique(other_native.columns) + right_on = list(right_on_mapper.values()) # we now have the suffixed keys + return self._with_native( + self.native.merge( + other_native, + left_on=left_on, + right_on=right_on, + how="outer", + suffixes=("", suffix), + ) + ) + + return self._with_native( + self.native.merge( + other.native, + left_on=left_on, + right_on=right_on, + how=how, + suffixes=("", suffix), + ) + ) + + def join_asof( + self, + other: Self, + *, + left_on: str, + right_on: str, + by_left: Sequence[str] | None, + by_right: Sequence[str] | None, + strategy: AsofJoinStrategy, + suffix: str, + ) -> Self: + plx = self.__native_namespace__() + return self._with_native( + plx.merge_asof( + self.native, + other.native, + left_on=left_on, + right_on=right_on, + left_by=by_left, + right_by=by_right, + direction=strategy, + suffixes=("", suffix), + ) + ) + + # --- partial reduction --- + + def head(self, n: int) -> Self: + return self._with_native(self.native.head(n), validate_column_names=False) + + def tail(self, n: int) -> Self: + return self._with_native(self.native.tail(n), validate_column_names=False) + + def unique( + self, + subset: 
Sequence[str] | None, + *, + keep: UniqueKeepStrategy, + maintain_order: bool | None = None, + ) -> Self: + # The param `maintain_order` is only here for compatibility with the Polars API + # and has no effect on the output. + mapped_keep = {"none": False, "any": "first"}.get(keep, keep) + if subset and (error := self._check_columns_exist(subset)): + raise error + return self._with_native( + self.native.drop_duplicates(subset=subset, keep=mapped_keep), + validate_column_names=False, + ) + + # --- lazy-only --- + def lazy(self, *, backend: Implementation | None = None) -> CompliantLazyFrameAny: + from narwhals.utils import parse_version + + pandas_df = self.to_pandas() + if backend is None: + return self + elif backend is Implementation.DUCKDB: + import duckdb # ignore-banned-import + + from narwhals._duckdb.dataframe import DuckDBLazyFrame + + return DuckDBLazyFrame( + df=duckdb.table("pandas_df"), + backend_version=parse_version(duckdb), + version=self._version, + ) + elif backend is Implementation.POLARS: + import polars as pl # ignore-banned-import + + from narwhals._polars.dataframe import PolarsLazyFrame + + return PolarsLazyFrame( + df=pl.from_pandas(pandas_df).lazy(), + backend_version=parse_version(pl), + version=self._version, + ) + elif backend is Implementation.DASK: + import dask # ignore-banned-import + import dask.dataframe as dd # ignore-banned-import + + from narwhals._dask.dataframe import DaskLazyFrame + + return DaskLazyFrame( + native_dataframe=dd.from_pandas(pandas_df), + backend_version=parse_version(dask), + version=self._version, + ) + raise AssertionError # pragma: no cover + + @property + def shape(self) -> tuple[int, int]: + return self.native.shape + + def to_dict(self, *, as_series: bool) -> dict[str, Any]: + if as_series: + return { + col: PandasLikeSeries.from_native(self.native[col], context=self) + for col in self.columns + } + return self.native.to_dict(orient="list") + + def to_numpy(self, dtype: Any = None, *, copy: bool | None = None) -> _2DArray: + native_dtypes = self.native.dtypes + + if copy is None: + # pandas default differs from Polars, but cuDF default is True + copy = self._implementation is Implementation.CUDF + + if native_dtypes.isin(CLASSICAL_NUMPY_DTYPES).all(): + # Fast path, no conversions necessary. + if dtype is not None: + return self.native.to_numpy(dtype=dtype, copy=copy) + return self.native.to_numpy(copy=copy) + + dtype_datetime = self._version.dtypes.Datetime + to_convert = [ + key + for key, val in self.schema.items() + if isinstance(val, dtype_datetime) and val.time_zone is not None + ] + if to_convert: + df = self.with_columns( + self.__narwhals_namespace__() + .col(*to_convert) + .dt.convert_time_zone("UTC") + .dt.replace_time_zone(None) + ).native + else: + df = self.native + + if dtype is not None: + return df.to_numpy(dtype=dtype, copy=copy) + + # pandas return `object` dtype for nullable dtypes if dtype=None, + # so we cast each Series to numpy and let numpy find a common dtype. + # If there aren't any dtypes where `to_numpy()` is "broken" (i.e. it + # returns Object) then we just call `to_numpy()` on the DataFrame. 
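The comment above explains why `to_numpy` needs a per-column fallback for nullable dtypes; a small plain-pandas illustration of the behaviour it works around (no narwhals involved):

```python
import numpy as np
import pandas as pd

s = pd.Series([1, None], dtype="Int64")  # nullable extension dtype

# With dtype=None, pandas falls back to an object array holding pd.NA:
print(s.to_numpy())  # array([1, <NA>], dtype=object)

# Converting with an explicit dtype and na_value avoids the object array;
# the fallback path above instead converts column by column and lets NumPy
# pick a common dtype when stacking the results:
print(s.to_numpy(dtype="float64", na_value=np.nan))  # array([ 1., nan])
```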
+ for col_dtype in native_dtypes: + if str(col_dtype) in PANDAS_TO_NUMPY_DTYPE_MISSING: + import numpy as np + + arr: Any = np.hstack( + [ + self.get_column(col).to_numpy(copy=copy, dtype=None)[:, None] + for col in self.columns + ] + ) + return arr + return df.to_numpy(copy=copy) + + def to_pandas(self) -> pd.DataFrame: + if self._implementation is Implementation.PANDAS: + return self.native + elif self._implementation is Implementation.CUDF: + return self.native.to_pandas() + elif self._implementation is Implementation.MODIN: + return self.native._to_pandas() + msg = f"Unknown implementation: {self._implementation}" # pragma: no cover + raise AssertionError(msg) + + def to_polars(self) -> pl.DataFrame: + import polars as pl # ignore-banned-import + + return pl.from_pandas(self.to_pandas()) + + def write_parquet(self, file: str | Path | BytesIO) -> None: + self.native.to_parquet(file) + + @overload + def write_csv(self, file: None) -> str: ... + + @overload + def write_csv(self, file: str | Path | BytesIO) -> None: ... + + def write_csv(self, file: str | Path | BytesIO | None) -> str | None: + return self.native.to_csv(file, index=False) + + # --- descriptive --- + def is_unique(self) -> PandasLikeSeries: + return PandasLikeSeries.from_native( + ~self.native.duplicated(keep=False), context=self + ) + + def item(self, row: int | None, column: int | str | None) -> Any: + if row is None and column is None: + if self.shape != (1, 1): + msg = ( + "can only call `.item()` if the dataframe is of shape (1, 1)," + " or if explicit row/col values are provided;" + f" frame has shape {self.shape!r}" + ) + raise ValueError(msg) + return self.native.iloc[0, 0] + + elif row is None or column is None: + msg = "cannot call `.item()` with only one of `row` or `column`" + raise ValueError(msg) + + _col = self.columns.index(column) if isinstance(column, str) else column + return self.native.iloc[row, _col] + + def clone(self) -> Self: + return self._with_native(self.native.copy(), validate_column_names=False) + + def gather_every(self, n: int, offset: int) -> Self: + return self._with_native(self.native.iloc[offset::n], validate_column_names=False) + + def _pivot_into_index_values( + self, + on: Sequence[str], + index: Sequence[str] | None, + values: Sequence[str] | None, + /, + ) -> tuple[Sequence[str], Sequence[str]]: + index = index or ( + exclude_column_names(self, {*on, *values}) + if values + else exclude_column_names(self, on) + ) + values = values or exclude_column_names(self, {*on, *index}) + return index, values + + @staticmethod + def _pivot_multi_on_name(unique_values: tuple[str, ...], /) -> str: + LB, RB, Q = "{", "}", '"' # noqa: N806 + body = '","'.join(unique_values) + return f"{LB}{Q}{body}{Q}{RB}" + + @staticmethod + def _pivot_single_on_names( + column_names: Iterable[str], n_values: int, separator: str, / + ) -> list[str]: + if n_values > 1: + return [separator.join(col).strip() for col in column_names] + return [col[-1] for col in column_names] + + def _pivot_multi_on_names( + self, + column_names: Iterable[tuple[str, ...]], + n_on: int, + n_values: int, + separator: str, + /, + ) -> Iterator[str]: + if n_values > 1: + for col in column_names: + names = col[-n_on:] + prefix = col[0] + yield separator.join((prefix, self._pivot_multi_on_name(names))) + else: + for col in column_names: + yield self._pivot_multi_on_name(col[-n_on:]) + + def _pivot_remap_column_names( + self, column_names: Iterable[Any], *, n_on: int, n_values: int, separator: str + ) -> list[str]: + """Reformat output column 
names from a native pivot operation, to match `polars`. + + Note: + `column_names` is a `pd.MultiIndex`, but not in the stubs. + """ + if n_on == 1: + return self._pivot_single_on_names(column_names, n_values, separator) + return list(self._pivot_multi_on_names(column_names, n_on, n_values, separator)) + + def _pivot_table( + self, + on: Sequence[str], + index: Sequence[str], + values: Sequence[str], + aggregate_function: Literal[ + "min", "max", "first", "last", "sum", "mean", "median" + ], + /, + ) -> Any: + categorical = self._version.dtypes.Categorical + kwds: dict[Any, Any] = {"observed": True} + if self._implementation is Implementation.CUDF: + kwds.pop("observed") + cols = set(chain(values, index, on)) + schema = self.schema.items() + if any( + tp for name, tp in schema if name in cols and isinstance(tp, categorical) + ): + msg = "`pivot` with Categoricals is not implemented for cuDF backend" + raise NotImplementedError(msg) + return self.native.pivot_table( + values=values, + index=index, + columns=on, + aggfunc=aggregate_function, + margins=False, + **kwds, + ) + + def _pivot( + self, + on: Sequence[str], + index: Sequence[str], + values: Sequence[str], + aggregate_function: PivotAgg | None, + /, + ) -> pd.DataFrame: + if aggregate_function is None: + return self.native.pivot(columns=on, index=index, values=values) + elif aggregate_function == "len": + return ( + self.native.groupby([*on, *index], as_index=False) + .agg(dict.fromkeys(values, "size")) + .pivot(columns=on, index=index, values=values) + ) + return self._pivot_table(on, index, values, aggregate_function) + + def pivot( + self, + on: Sequence[str], + *, + index: Sequence[str] | None, + values: Sequence[str] | None, + aggregate_function: PivotAgg | None, + sort_columns: bool, + separator: str, + ) -> Self: + implementation = self._implementation + backend_version = self._backend_version + if implementation.is_pandas() and backend_version < (1, 1): # pragma: no cover + msg = "pivot is only supported for 'pandas>=1.1'" + raise NotImplementedError(msg) + if implementation.is_modin(): + msg = "pivot is not supported for Modin backend due to https://github.com/modin-project/modin/issues/7409." 
+ raise NotImplementedError(msg) + + index, values = self._pivot_into_index_values(on, index, values) + result = self._pivot(on, index, values, aggregate_function) + + # Select the columns in the right order + uniques = ( + ( + self.get_column(col) + .unique() + .sort(descending=False, nulls_last=False) + .to_list() + for col in on + ) + if sort_columns + else (self.get_column(col).unique().to_list() for col in on) + ) + ordered_cols = list(product(values, *chain(uniques))) + result = result.loc[:, ordered_cols] + columns = result.columns + remapped = self._pivot_remap_column_names( + columns, n_on=len(on), n_values=len(values), separator=separator + ) + result.columns = remapped # type: ignore[assignment] + result.columns.names = [""] + return self._with_native(result.reset_index()) + + def to_arrow(self) -> Any: + if self._implementation is Implementation.CUDF: + return self.native.to_arrow(preserve_index=False) + + import pyarrow as pa # ignore-banned-import() + + return pa.Table.from_pandas(self.native) + + def sample( + self, + n: int | None, + *, + fraction: float | None, + with_replacement: bool, + seed: int | None, + ) -> Self: + return self._with_native( + self.native.sample( + n=n, frac=fraction, replace=with_replacement, random_state=seed + ), + validate_column_names=False, + ) + + def unpivot( + self, + on: Sequence[str] | None, + index: Sequence[str] | None, + variable_name: str, + value_name: str, + ) -> Self: + return self._with_native( + self.native.melt( + id_vars=index, + value_vars=on, + var_name=variable_name, + value_name=value_name, + ) + ) + + def explode(self, columns: Sequence[str]) -> Self: + dtypes = self._version.dtypes + + schema = self.collect_schema() + for col_to_explode in columns: + dtype = schema[col_to_explode] + + if dtype != dtypes.List: + msg = ( + f"`explode` operation not supported for dtype `{dtype}`, " + "expected List type" + ) + raise InvalidOperationError(msg) + + if len(columns) == 1: + return self._with_native( + self.native.explode(columns[0]), validate_column_names=False + ) + else: + native_frame = self.native + anchor_series = native_frame[columns[0]].list.len() + + if not all( + (native_frame[col_name].list.len() == anchor_series).all() + for col_name in columns[1:] + ): + msg = "exploded columns must have matching element counts" + raise ShapeError(msg) + + original_columns = self.columns + other_columns = [c for c in original_columns if c not in columns] + + exploded_frame = native_frame[[*other_columns, columns[0]]].explode( + columns[0] + ) + exploded_series = [ + native_frame[col_name].explode().to_frame() for col_name in columns[1:] + ] + + plx = self.__native_namespace__() + return self._with_native( + plx.concat([exploded_frame, *exploded_series], axis=1)[original_columns], + validate_column_names=False, + ) diff --git a/venv/lib/python3.8/site-packages/narwhals/_pandas_like/expr.py b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/expr.py new file mode 100644 index 0000000..0cd9958 --- /dev/null +++ b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/expr.py @@ -0,0 +1,402 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Sequence + +from narwhals._compliant import EagerExpr +from narwhals._expression_parsing import evaluate_output_names_and_aliases +from narwhals._pandas_like.group_by import PandasLikeGroupBy +from narwhals._pandas_like.series import PandasLikeSeries +from narwhals._utils import generate_temporary_column_name + +if TYPE_CHECKING: + from typing_extensions import Self + + 
from narwhals._compliant.typing import AliasNames, EvalNames, EvalSeries, ScalarKwargs + from narwhals._expression_parsing import ExprMetadata + from narwhals._pandas_like.dataframe import PandasLikeDataFrame + from narwhals._pandas_like.namespace import PandasLikeNamespace + from narwhals._utils import Implementation, Version, _FullContext + from narwhals.typing import ( + FillNullStrategy, + NonNestedLiteral, + PythonLiteral, + RankMethod, + ) + +WINDOW_FUNCTIONS_TO_PANDAS_EQUIVALENT = { + "cum_sum": "cumsum", + "cum_min": "cummin", + "cum_max": "cummax", + "cum_prod": "cumprod", + # Pandas cumcount starts counting from 0 while Polars starts from 1 + # Pandas cumcount counts nulls while Polars does not + # So, instead of using "cumcount" we use "cumsum" on notna() to get the same result + "cum_count": "cumsum", + "rolling_sum": "sum", + "rolling_mean": "mean", + "rolling_std": "std", + "rolling_var": "var", + "shift": "shift", + "rank": "rank", + "diff": "diff", + "fill_null": "fillna", +} + + +def window_kwargs_to_pandas_equivalent( + function_name: str, kwargs: ScalarKwargs +) -> dict[str, PythonLiteral]: + if function_name == "shift": + assert "n" in kwargs # noqa: S101 + pandas_kwargs: dict[str, PythonLiteral] = {"periods": kwargs["n"]} + elif function_name == "rank": + assert "method" in kwargs # noqa: S101 + assert "descending" in kwargs # noqa: S101 + _method = kwargs["method"] + pandas_kwargs = { + "method": "first" if _method == "ordinal" else _method, + "ascending": not kwargs["descending"], + "na_option": "keep", + "pct": False, + } + elif function_name.startswith("cum_"): # Cumulative operation + pandas_kwargs = {"skipna": True} + elif function_name.startswith("rolling_"): # Rolling operation + assert "min_samples" in kwargs # noqa: S101 + assert "window_size" in kwargs # noqa: S101 + assert "center" in kwargs # noqa: S101 + pandas_kwargs = { + "min_periods": kwargs["min_samples"], + "window": kwargs["window_size"], + "center": kwargs["center"], + } + elif function_name in {"std", "var"}: + assert "ddof" in kwargs # noqa: S101 + pandas_kwargs = {"ddof": kwargs["ddof"]} + elif function_name == "fill_null": + assert "strategy" in kwargs # noqa: S101 + assert "limit" in kwargs # noqa: S101 + pandas_kwargs = {"strategy": kwargs["strategy"], "limit": kwargs["limit"]} + else: # sum, len, ... + pandas_kwargs = {} + return pandas_kwargs + + +class PandasLikeExpr(EagerExpr["PandasLikeDataFrame", PandasLikeSeries]): + def __init__( + self, + call: EvalSeries[PandasLikeDataFrame, PandasLikeSeries], + *, + depth: int, + function_name: str, + evaluate_output_names: EvalNames[PandasLikeDataFrame], + alias_output_names: AliasNames | None, + implementation: Implementation, + backend_version: tuple[int, ...], + version: Version, + scalar_kwargs: ScalarKwargs | None = None, + ) -> None: + self._call = call + self._depth = depth + self._function_name = function_name + self._evaluate_output_names = evaluate_output_names + self._alias_output_names = alias_output_names + self._implementation = implementation + self._backend_version = backend_version + self._version = version + self._scalar_kwargs = scalar_kwargs or {} + self._metadata: ExprMetadata | None = None + + def __narwhals_namespace__(self) -> PandasLikeNamespace: + from narwhals._pandas_like.namespace import PandasLikeNamespace + + return PandasLikeNamespace( + self._implementation, self._backend_version, version=self._version + ) + + def __narwhals_expr__(self) -> None: ... 
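The `cum_count` entry in the `WINDOW_FUNCTIONS_TO_PANDAS_EQUIVALENT` mapping above is resolved with the `notna().cumsum()` substitution its comment describes; a quick plain-pandas sketch showing why this reproduces Polars' semantics (count non-null values, starting from 1):

```python
import pandas as pd

s = pd.Series([1.0, None, 3.0, None, 5.0])

# pandas' cumcount starts at 0 and counts null rows too; cumulatively
# summing the not-null mask matches Polars' cum_count instead:
print(s.notna().cumsum().tolist())  # [1, 1, 2, 2, 3]
```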
+ + @classmethod + def from_column_names( + cls: type[Self], + evaluate_column_names: EvalNames[PandasLikeDataFrame], + /, + *, + context: _FullContext, + function_name: str = "", + ) -> Self: + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + try: + return [ + PandasLikeSeries( + df._native_frame[column_name], + implementation=df._implementation, + backend_version=df._backend_version, + version=df._version, + ) + for column_name in evaluate_column_names(df) + ] + except KeyError as e: + if error := df._check_columns_exist(evaluate_column_names(df)): + raise error from e + raise + + return cls( + func, + depth=0, + function_name=function_name, + evaluate_output_names=evaluate_column_names, + alias_output_names=None, + implementation=context._implementation, + backend_version=context._backend_version, + version=context._version, + ) + + @classmethod + def from_column_indices(cls, *column_indices: int, context: _FullContext) -> Self: + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + native = df.native + return [ + PandasLikeSeries.from_native(native.iloc[:, i], context=df) + for i in column_indices + ] + + return cls( + func, + depth=0, + function_name="nth", + evaluate_output_names=cls._eval_names_indices(column_indices), + alias_output_names=None, + implementation=context._implementation, + backend_version=context._backend_version, + version=context._version, + ) + + def ewm_mean( + self, + *, + com: float | None, + span: float | None, + half_life: float | None, + alpha: float | None, + adjust: bool, + min_samples: int, + ignore_nulls: bool, + ) -> Self: + return self._reuse_series( + "ewm_mean", + com=com, + span=span, + half_life=half_life, + alpha=alpha, + adjust=adjust, + min_samples=min_samples, + ignore_nulls=ignore_nulls, + ) + + def cum_sum(self, *, reverse: bool) -> Self: + return self._reuse_series("cum_sum", scalar_kwargs={"reverse": reverse}) + + def shift(self, n: int) -> Self: + return self._reuse_series("shift", scalar_kwargs={"n": n}) + + def over( # noqa: C901, PLR0915 + self, partition_by: Sequence[str], order_by: Sequence[str] + ) -> Self: + if not partition_by: + # e.g. `nw.col('a').cum_sum().order_by(key)` + # We can always easily support this as it doesn't require grouping. + assert order_by # noqa: S101 + + def func(df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]: + token = generate_temporary_column_name(8, df.columns) + df = df.with_row_index(token).sort( + *order_by, descending=False, nulls_last=False + ) + results = self(df.drop([token], strict=True)) + sorting_indices = df.get_column(token) + for s in results: + s._scatter_in_place(sorting_indices, s) + return results + elif not self._is_elementary(): + msg = ( + "Only elementary expressions are supported for `.over` in pandas-like backends.\n\n" + "Please see: " + "https://narwhals-dev.github.io/narwhals/concepts/improve_group_by_operation/" + ) + raise NotImplementedError(msg) + else: + function_name = PandasLikeGroupBy._leaf_name(self) + pandas_function_name = WINDOW_FUNCTIONS_TO_PANDAS_EQUIVALENT.get( + function_name, PandasLikeGroupBy._REMAP_AGGS.get(function_name) + ) + if pandas_function_name is None: + msg = ( + f"Unsupported function: {function_name} in `over` context.\n\n" + f"Supported functions are {', '.join(WINDOW_FUNCTIONS_TO_PANDAS_EQUIVALENT)}\n" + f"and {', '.join(PandasLikeGroupBy._REMAP_AGGS)}." 
+ ) + raise NotImplementedError(msg) + pandas_kwargs = window_kwargs_to_pandas_equivalent( + function_name, self._scalar_kwargs + ) + + def func(df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]: # noqa: C901, PLR0912 + output_names, aliases = evaluate_output_names_and_aliases(self, df, []) + if function_name == "cum_count": + plx = self.__narwhals_namespace__() + df = df.with_columns(~plx.col(*output_names).is_null()) + + if function_name.startswith("cum_"): + assert "reverse" in self._scalar_kwargs # noqa: S101 + reverse = self._scalar_kwargs["reverse"] + else: + assert "reverse" not in self._scalar_kwargs # noqa: S101 + reverse = False + + if order_by: + columns = list(set(partition_by).union(output_names).union(order_by)) + token = generate_temporary_column_name(8, columns) + df = ( + df.simple_select(*columns) + .with_row_index(token) + .sort(*order_by, descending=reverse, nulls_last=reverse) + ) + sorting_indices = df.get_column(token) + elif reverse: + columns = list(set(partition_by).union(output_names)) + df = df.simple_select(*columns)._gather_slice(slice(None, None, -1)) + grouped = df._native_frame.groupby(partition_by) + if function_name.startswith("rolling"): + rolling = grouped[list(output_names)].rolling(**pandas_kwargs) + assert pandas_function_name is not None # help mypy # noqa: S101 + if pandas_function_name in {"std", "var"}: + assert "ddof" in self._scalar_kwargs # noqa: S101 + res_native = getattr(rolling, pandas_function_name)( + ddof=self._scalar_kwargs["ddof"] + ) + else: + res_native = getattr(rolling, pandas_function_name)() + elif function_name == "fill_null": + assert "strategy" in self._scalar_kwargs # noqa: S101 + assert "limit" in self._scalar_kwargs # noqa: S101 + df_grouped = grouped[list(output_names)] + if self._scalar_kwargs["strategy"] == "forward": + res_native = df_grouped.ffill(limit=self._scalar_kwargs["limit"]) + elif self._scalar_kwargs["strategy"] == "backward": + res_native = df_grouped.bfill(limit=self._scalar_kwargs["limit"]) + else: # pragma: no cover + # This is deprecated in pandas. Indeed, `nw.col('a').fill_null(3).over('b')` + # does not seem very useful, and DuckDB doesn't support it either. + msg = "`fill_null` with `over` without `strategy` specified is not supported." + raise NotImplementedError(msg) + elif function_name == "len": + if len(output_names) != 1: # pragma: no cover + msg = "Safety check failed, please report a bug." 
+ raise AssertionError(msg) + res_native = grouped.transform("size").to_frame(aliases[0]) + else: + res_native = grouped[list(output_names)].transform( + pandas_function_name, **pandas_kwargs + ) + result_frame = df._with_native(res_native).rename( + dict(zip(output_names, aliases)) + ) + results = [result_frame.get_column(name) for name in aliases] + if order_by: + for s in results: + s._scatter_in_place(sorting_indices, s) + return results + if reverse: + return [s._gather_slice(slice(None, None, -1)) for s in results] + return results + + return self.__class__( + func, + depth=self._depth + 1, + function_name=self._function_name + "->over", + evaluate_output_names=self._evaluate_output_names, + alias_output_names=self._alias_output_names, + implementation=self._implementation, + backend_version=self._backend_version, + version=self._version, + ) + + def cum_count(self, *, reverse: bool) -> Self: + return self._reuse_series("cum_count", scalar_kwargs={"reverse": reverse}) + + def cum_min(self, *, reverse: bool) -> Self: + return self._reuse_series("cum_min", scalar_kwargs={"reverse": reverse}) + + def cum_max(self, *, reverse: bool) -> Self: + return self._reuse_series("cum_max", scalar_kwargs={"reverse": reverse}) + + def cum_prod(self, *, reverse: bool) -> Self: + return self._reuse_series("cum_prod", scalar_kwargs={"reverse": reverse}) + + def fill_null( + self, + value: Self | NonNestedLiteral, + strategy: FillNullStrategy | None, + limit: int | None, + ) -> Self: + return self._reuse_series( + "fill_null", scalar_kwargs={"strategy": strategy, "limit": limit}, value=value + ) + + def rolling_sum(self, window_size: int, *, min_samples: int, center: bool) -> Self: + return self._reuse_series( + "rolling_sum", + scalar_kwargs={ + "window_size": window_size, + "min_samples": min_samples, + "center": center, + }, + ) + + def rolling_mean(self, window_size: int, *, min_samples: int, center: bool) -> Self: + return self._reuse_series( + "rolling_mean", + scalar_kwargs={ + "window_size": window_size, + "min_samples": min_samples, + "center": center, + }, + ) + + def rolling_std( + self, window_size: int, *, min_samples: int, center: bool, ddof: int + ) -> Self: + return self._reuse_series( + "rolling_std", + scalar_kwargs={ + "window_size": window_size, + "min_samples": min_samples, + "center": center, + "ddof": ddof, + }, + ) + + def rolling_var( + self, window_size: int, *, min_samples: int, center: bool, ddof: int + ) -> Self: + return self._reuse_series( + "rolling_var", + scalar_kwargs={ + "window_size": window_size, + "min_samples": min_samples, + "center": center, + "ddof": ddof, + }, + ) + + def rank(self, method: RankMethod, *, descending: bool) -> Self: + return self._reuse_series( + "rank", scalar_kwargs={"method": method, "descending": descending} + ) + + def log(self, base: float) -> Self: + return self._reuse_series("log", base=base) + + def exp(self) -> Self: + return self._reuse_series("exp") diff --git a/venv/lib/python3.8/site-packages/narwhals/_pandas_like/group_by.py b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/group_by.py new file mode 100644 index 0000000..ede3f05 --- /dev/null +++ b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/group_by.py @@ -0,0 +1,293 @@ +from __future__ import annotations + +import collections +import warnings +from typing import TYPE_CHECKING, Any, ClassVar, Iterator, Mapping, Sequence + +from narwhals._compliant import EagerGroupBy +from narwhals._expression_parsing import evaluate_output_names_and_aliases +from 
narwhals._pandas_like.utils import select_columns_by_name +from narwhals._utils import find_stacklevel + +if TYPE_CHECKING: + from narwhals._compliant.group_by import NarwhalsAggregation + from narwhals._pandas_like.dataframe import PandasLikeDataFrame + from narwhals._pandas_like.expr import PandasLikeExpr + + +class PandasLikeGroupBy(EagerGroupBy["PandasLikeDataFrame", "PandasLikeExpr", str]): + _REMAP_AGGS: ClassVar[Mapping[NarwhalsAggregation, Any]] = { + "sum": "sum", + "mean": "mean", + "median": "median", + "max": "max", + "min": "min", + "std": "std", + "var": "var", + "len": "size", + "n_unique": "nunique", + "count": "count", + } + + def __init__( + self, + df: PandasLikeDataFrame, + keys: Sequence[PandasLikeExpr] | Sequence[str], + /, + *, + drop_null_keys: bool, + ) -> None: + self._df = df + self._drop_null_keys = drop_null_keys + self._compliant_frame, self._keys, self._output_key_names = self._parse_keys( + df, keys=keys + ) + # Drop index to avoid potential collisions: + # https://github.com/narwhals-dev/narwhals/issues/1907. + if set(self.compliant.native.index.names).intersection(self.compliant.columns): + native_frame = self.compliant.native.reset_index(drop=True) + else: + native_frame = self.compliant.native + if ( + self.compliant._implementation.is_pandas() + and self.compliant._backend_version < (1, 1) + ): # pragma: no cover + if ( + not drop_null_keys + and self.compliant.simple_select(*self._keys).native.isna().any().any() + ): + msg = "Grouping by null values is not supported in pandas < 1.1.0" + raise NotImplementedError(msg) + self._grouped = native_frame.groupby( + list(self._keys), sort=False, as_index=True, observed=True + ) + else: + self._grouped = native_frame.groupby( + list(self._keys), + sort=False, + as_index=True, + dropna=drop_null_keys, + observed=True, + ) + + def agg(self, *exprs: PandasLikeExpr) -> PandasLikeDataFrame: # noqa: C901, PLR0912, PLR0914, PLR0915 + implementation = self.compliant._implementation + backend_version = self.compliant._backend_version + new_names: list[str] = self._keys.copy() + + all_aggs_are_simple = True + exclude = (*self._keys, *self._output_key_names) + for expr in exprs: + _, aliases = evaluate_output_names_and_aliases(expr, self.compliant, exclude) + new_names.extend(aliases) + if not self._is_simple(expr): + all_aggs_are_simple = False + + # dict of {output_name: root_name} that we count n_unique on + # We need to do this separately from the rest so that we + # can pass the `dropna` kwargs. + nunique_aggs: dict[str, str] = {} + simple_aggs: dict[str, list[str]] = collections.defaultdict(list) + simple_aggs_functions: set[str] = set() + + # ddof to (output_names, aliases) mapping + std_aggs: dict[int, tuple[list[str], list[str]]] = collections.defaultdict( + lambda: ([], []) + ) + var_aggs: dict[int, tuple[list[str], list[str]]] = collections.defaultdict( + lambda: ([], []) + ) + + expected_old_names: list[str] = [] + simple_agg_new_names: list[str] = [] + + if all_aggs_are_simple: # noqa: PLR1702 + for expr in exprs: + output_names, aliases = evaluate_output_names_and_aliases( + expr, self.compliant, exclude + ) + if expr._depth == 0: + # e.g. `agg(nw.len())` + function_name = self._remap_expr_name(expr._function_name) + simple_aggs_functions.add(function_name) + + for alias in aliases: + expected_old_names.append(f"{self._keys[0]}_{function_name}") + simple_aggs[self._keys[0]].append(function_name) + simple_agg_new_names.append(alias) + continue + + # e.g. 
`agg(nw.mean('a'))` + function_name = self._remap_expr_name(self._leaf_name(expr)) + is_n_unique = function_name == "nunique" + is_std = function_name == "std" + is_var = function_name == "var" + for output_name, alias in zip(output_names, aliases): + if is_n_unique: + nunique_aggs[alias] = output_name + elif is_std and (ddof := expr._scalar_kwargs["ddof"]) != 1: # pyright: ignore[reportTypedDictNotRequiredAccess] + std_aggs[ddof][0].append(output_name) + std_aggs[ddof][1].append(alias) + elif is_var and (ddof := expr._scalar_kwargs["ddof"]) != 1: # pyright: ignore[reportTypedDictNotRequiredAccess] + var_aggs[ddof][0].append(output_name) + var_aggs[ddof][1].append(alias) + else: + expected_old_names.append(f"{output_name}_{function_name}") + simple_aggs[output_name].append(function_name) + simple_agg_new_names.append(alias) + simple_aggs_functions.add(function_name) + + result_aggs = [] + + if simple_aggs: + # Fast path for single aggregation such as `df.groupby(...).mean()` + if ( + len(simple_aggs_functions) == 1 + and (agg_method := simple_aggs_functions.pop()) != "size" + and len(simple_aggs) > 1 + ): + result_simple_aggs = getattr( + self._grouped[list(simple_aggs.keys())], agg_method + )() + result_simple_aggs.columns = [ + f"{a}_{agg_method}" for a in result_simple_aggs.columns + ] + else: + result_simple_aggs = self._grouped.agg(simple_aggs) + result_simple_aggs.columns = [ + f"{a}_{b}" for a, b in result_simple_aggs.columns + ] + if not ( + set(result_simple_aggs.columns) == set(expected_old_names) + and len(result_simple_aggs.columns) == len(expected_old_names) + ): # pragma: no cover + msg = ( + f"Safety assertion failed, expected {expected_old_names} " + f"got {result_simple_aggs.columns}, " + "please report a bug at https://github.com/narwhals-dev/narwhals/issues" + ) + raise AssertionError(msg) + + # Rename columns, being very careful + expected_old_names_indices: dict[str, list[int]] = ( + collections.defaultdict(list) + ) + for idx, item in enumerate(expected_old_names): + expected_old_names_indices[item].append(idx) + index_map: list[int] = [ + expected_old_names_indices[item].pop(0) + for item in result_simple_aggs.columns + ] + result_simple_aggs.columns = [simple_agg_new_names[i] for i in index_map] + result_aggs.append(result_simple_aggs) + + if nunique_aggs: + result_nunique_aggs = self._grouped[list(nunique_aggs.values())].nunique( + dropna=False + ) + result_nunique_aggs.columns = list(nunique_aggs.keys()) + + result_aggs.append(result_nunique_aggs) + + if std_aggs: + for ddof, (std_output_names, std_aliases) in std_aggs.items(): + _aggregation = self._grouped[std_output_names].std(ddof=ddof) + # `_aggregation` is a new object so it's OK to operate inplace. + _aggregation.columns = std_aliases + result_aggs.append(_aggregation) + if var_aggs: + for ddof, (var_output_names, var_aliases) in var_aggs.items(): + _aggregation = self._grouped[var_output_names].var(ddof=ddof) + # `_aggregation` is a new object so it's OK to operate inplace. 
+ _aggregation.columns = var_aliases + result_aggs.append(_aggregation) + + if result_aggs: + output_names_counter = collections.Counter( + c for frame in result_aggs for c in frame + ) + if any(v > 1 for v in output_names_counter.values()): + msg = "" + for key, value in output_names_counter.items(): + if value > 1: + msg += f"\n- '{key}' {value} times" + else: # pragma: no cover + pass + msg = f"Expected unique output names, got:{msg}" + raise ValueError(msg) + namespace = self.compliant.__narwhals_namespace__() + result = namespace._concat_horizontal(result_aggs) + else: + # No aggregation provided + result = self.compliant.__native_namespace__().DataFrame( + list(self._grouped.groups.keys()), columns=self._keys + ) + # Keep inplace=True to avoid making a redundant copy. + # This may need updating, depending on https://github.com/pandas-dev/pandas/pull/51466/files + result.reset_index(inplace=True) # noqa: PD002 + return self.compliant._with_native( + select_columns_by_name(result, new_names, backend_version, implementation) + ).rename(dict(zip(self._keys, self._output_key_names))) + + if self.compliant.native.empty: + # Don't even attempt this, it's way too inconsistent across pandas versions. + msg = ( + "No results for group-by aggregation.\n\n" + "Hint: you were probably trying to apply a non-elementary aggregation with a " + "pandas-like API.\n" + "Please rewrite your query such that group-by aggregations " + "are elementary. For example, instead of:\n\n" + " df.group_by('a').agg(nw.col('b').round(2).mean())\n\n" + "use:\n\n" + " df.with_columns(nw.col('b').round(2)).group_by('a').agg(nw.col('b').mean())\n\n" + ) + raise ValueError(msg) + + warnings.warn( + "Found complex group-by expression, which can't be expressed efficiently with the " + "pandas API. If you can, please rewrite your query such that group-by aggregations " + "are simple (e.g. mean, std, min, max, ...). \n\n" + "Please see: " + "https://narwhals-dev.github.io/narwhals/concepts/improve_group_by_operation/", + UserWarning, + stacklevel=find_stacklevel(), + ) + + def func(df: Any) -> Any: + out_group = [] + out_names = [] + for expr in exprs: + results_keys = expr(self.compliant._with_native(df)) + for result_keys in results_keys: + out_group.append(result_keys.native.iloc[0]) + out_names.append(result_keys.name) + ns = self.compliant.__narwhals_namespace__() + return ns._series.from_iterable(out_group, index=out_names, context=ns).native + + if implementation.is_pandas() and backend_version >= (2, 2): + result_complex = self._grouped.apply(func, include_groups=False) + else: # pragma: no cover + result_complex = self._grouped.apply(func) + + # Keep inplace=True to avoid making a redundant copy. 
+ # This may need updating, depending on https://github.com/pandas-dev/pandas/pull/51466/files + result_complex.reset_index(inplace=True) # noqa: PD002 + return self.compliant._with_native( + select_columns_by_name( + result_complex, new_names, backend_version, implementation + ) + ).rename(dict(zip(self._keys, self._output_key_names))) + + def __iter__(self) -> Iterator[tuple[Any, PandasLikeDataFrame]]: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=".*a length 1 tuple will be returned", + category=FutureWarning, + ) + + for key, group in self._grouped: + yield ( + key, + self.compliant._with_native(group).simple_select(*self._df.columns), + ) diff --git a/venv/lib/python3.8/site-packages/narwhals/_pandas_like/namespace.py b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/namespace.py new file mode 100644 index 0000000..5612c85 --- /dev/null +++ b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/namespace.py @@ -0,0 +1,332 @@ +from __future__ import annotations + +import operator +import warnings +from functools import reduce +from typing import TYPE_CHECKING, Literal, Sequence + +import pandas as pd + +from narwhals._compliant import CompliantThen, EagerNamespace, EagerWhen +from narwhals._expression_parsing import ( + combine_alias_output_names, + combine_evaluate_output_names, +) +from narwhals._pandas_like.dataframe import PandasLikeDataFrame +from narwhals._pandas_like.expr import PandasLikeExpr +from narwhals._pandas_like.selectors import PandasSelectorNamespace +from narwhals._pandas_like.series import PandasLikeSeries +from narwhals._pandas_like.utils import align_series_full_broadcast + +if TYPE_CHECKING: + from narwhals._pandas_like.typing import NDFrameT + from narwhals._utils import Implementation, Version + from narwhals.typing import IntoDType, NonNestedLiteral + +VERTICAL: Literal[0] = 0 +HORIZONTAL: Literal[1] = 1 + + +class PandasLikeNamespace( + EagerNamespace[PandasLikeDataFrame, PandasLikeSeries, PandasLikeExpr, pd.DataFrame] +): + @property + def _dataframe(self) -> type[PandasLikeDataFrame]: + return PandasLikeDataFrame + + @property + def _expr(self) -> type[PandasLikeExpr]: + return PandasLikeExpr + + @property + def _series(self) -> type[PandasLikeSeries]: + return PandasLikeSeries + + @property + def selectors(self) -> PandasSelectorNamespace: + return PandasSelectorNamespace.from_namespace(self) + + # --- not in spec --- + def __init__( + self, + implementation: Implementation, + backend_version: tuple[int, ...], + version: Version, + ) -> None: + self._implementation = implementation + self._backend_version = backend_version + self._version = version + + def lit(self, value: NonNestedLiteral, dtype: IntoDType | None) -> PandasLikeExpr: + def _lit_pandas_series(df: PandasLikeDataFrame) -> PandasLikeSeries: + pandas_series = self._series.from_iterable( + data=[value], + name="literal", + index=df._native_frame.index[0:1], + context=self, + ) + if dtype: + return pandas_series.cast(dtype) + return pandas_series + + return PandasLikeExpr( + lambda df: [_lit_pandas_series(df)], + depth=0, + function_name="lit", + evaluate_output_names=lambda _df: ["literal"], + alias_output_names=None, + implementation=self._implementation, + backend_version=self._backend_version, + version=self._version, + ) + + def len(self) -> PandasLikeExpr: + return PandasLikeExpr( + lambda df: [ + self._series.from_iterable( + [len(df._native_frame)], name="len", index=[0], context=self + ) + ], + depth=0, + function_name="len", + 
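+ # Editor's note (illustrative): `df.select(nw.len())` evaluates this
+ # expression to a single-row series, e.g. 3 for a three-row frame.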
evaluate_output_names=lambda _df: ["len"], + alias_output_names=None, + implementation=self._implementation, + backend_version=self._backend_version, + version=self._version, + ) + + # --- horizontal --- + def sum_horizontal(self, *exprs: PandasLikeExpr) -> PandasLikeExpr: + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + series = [s for _expr in exprs for s in _expr(df)] + series = align_series_full_broadcast(*series) + native_series = (s.fill_null(0, None, None) for s in series) + return [reduce(operator.add, native_series)] + + return self._expr._from_callable( + func=func, + depth=max(x._depth for x in exprs) + 1, + function_name="sum_horizontal", + evaluate_output_names=combine_evaluate_output_names(*exprs), + alias_output_names=combine_alias_output_names(*exprs), + context=self, + ) + + def all_horizontal(self, *exprs: PandasLikeExpr) -> PandasLikeExpr: + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + series = align_series_full_broadcast( + *(s for _expr in exprs for s in _expr(df)) + ) + return [reduce(operator.and_, series)] + + return self._expr._from_callable( + func=func, + depth=max(x._depth for x in exprs) + 1, + function_name="all_horizontal", + evaluate_output_names=combine_evaluate_output_names(*exprs), + alias_output_names=combine_alias_output_names(*exprs), + context=self, + ) + + def any_horizontal(self, *exprs: PandasLikeExpr) -> PandasLikeExpr: + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + series = align_series_full_broadcast( + *(s for _expr in exprs for s in _expr(df)) + ) + return [reduce(operator.or_, series)] + + return self._expr._from_callable( + func=func, + depth=max(x._depth for x in exprs) + 1, + function_name="any_horizontal", + evaluate_output_names=combine_evaluate_output_names(*exprs), + alias_output_names=combine_alias_output_names(*exprs), + context=self, + ) + + def mean_horizontal(self, *exprs: PandasLikeExpr) -> PandasLikeExpr: + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + expr_results = [s for _expr in exprs for s in _expr(df)] + series = align_series_full_broadcast( + *(s.fill_null(0, strategy=None, limit=None) for s in expr_results) + ) + non_na = align_series_full_broadcast(*(1 - s.is_null() for s in expr_results)) + return [reduce(operator.add, series) / reduce(operator.add, non_na)] + + return self._expr._from_callable( + func=func, + depth=max(x._depth for x in exprs) + 1, + function_name="mean_horizontal", + evaluate_output_names=combine_evaluate_output_names(*exprs), + alias_output_names=combine_alias_output_names(*exprs), + context=self, + ) + + def min_horizontal(self, *exprs: PandasLikeExpr) -> PandasLikeExpr: + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + series = [s for _expr in exprs for s in _expr(df)] + series = align_series_full_broadcast(*series) + + return [ + PandasLikeSeries( + self.concat( + (s.to_frame() for s in series), how="horizontal" + )._native_frame.min(axis=1), + implementation=self._implementation, + backend_version=self._backend_version, + version=self._version, + ).alias(series[0].name) + ] + + return self._expr._from_callable( + func=func, + depth=max(x._depth for x in exprs) + 1, + function_name="min_horizontal", + evaluate_output_names=combine_evaluate_output_names(*exprs), + alias_output_names=combine_alias_output_names(*exprs), + context=self, + ) + + def max_horizontal(self, *exprs: PandasLikeExpr) -> PandasLikeExpr: + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + series = [s for _expr in exprs for s in 
_expr(df)] + series = align_series_full_broadcast(*series) + + return [ + PandasLikeSeries( + self.concat( + (s.to_frame() for s in series), how="horizontal" + )._native_frame.max(axis=1), + implementation=self._implementation, + backend_version=self._backend_version, + version=self._version, + ).alias(series[0].name) + ] + + return self._expr._from_callable( + func=func, + depth=max(x._depth for x in exprs) + 1, + function_name="max_horizontal", + evaluate_output_names=combine_evaluate_output_names(*exprs), + alias_output_names=combine_alias_output_names(*exprs), + context=self, + ) + + @property + def _concat(self): # type: ignore[no-untyped-def] # noqa: ANN202 + """Return the **native** equivalent of `pd.concat`.""" + # NOTE: Leave un-annotated to allow `@overload` matching via inference. + if TYPE_CHECKING: + import pandas as pd + + return pd.concat + return self._implementation.to_native_namespace().concat + + def _concat_diagonal(self, dfs: Sequence[pd.DataFrame], /) -> pd.DataFrame: + if self._implementation.is_pandas() and self._backend_version < (3,): + if self._backend_version < (1,): + return self._concat(dfs, axis=VERTICAL, copy=False, sort=False) + return self._concat(dfs, axis=VERTICAL, copy=False) + return self._concat(dfs, axis=VERTICAL) + + def _concat_horizontal(self, dfs: Sequence[NDFrameT], /) -> pd.DataFrame: + if self._implementation.is_cudf(): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="The behavior of array concatenation with empty entries is deprecated", + category=FutureWarning, + ) + return self._concat(dfs, axis=HORIZONTAL) + elif self._implementation.is_pandas() and self._backend_version < (3,): + return self._concat(dfs, axis=HORIZONTAL, copy=False) + return self._concat(dfs, axis=HORIZONTAL) + + def _concat_vertical(self, dfs: Sequence[pd.DataFrame], /) -> pd.DataFrame: + cols_0 = dfs[0].columns + for i, df in enumerate(dfs[1:], start=1): + cols_current = df.columns + if not ( + (len(cols_current) == len(cols_0)) and (cols_current == cols_0).all() + ): + msg = ( + "unable to vstack, column names don't match:\n" + f" - dataframe 0: {cols_0.to_list()}\n" + f" - dataframe {i}: {cols_current.to_list()}\n" + ) + raise TypeError(msg) + if self._implementation.is_pandas() and self._backend_version < (3,): + return self._concat(dfs, axis=VERTICAL, copy=False) + return self._concat(dfs, axis=VERTICAL) + + def when(self, predicate: PandasLikeExpr) -> PandasWhen: + return PandasWhen.from_expr(predicate, context=self) + + def concat_str( + self, *exprs: PandasLikeExpr, separator: str, ignore_nulls: bool + ) -> PandasLikeExpr: + string = self._version.dtypes.String() + + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + expr_results = [s for _expr in exprs for s in _expr(df)] + series = align_series_full_broadcast(*(s.cast(string) for s in expr_results)) + null_mask = align_series_full_broadcast(*(s.is_null() for s in expr_results)) + + if not ignore_nulls: + null_mask_result = reduce(operator.or_, null_mask) + result = reduce(lambda x, y: x + separator + y, series).zip_with( + ~null_mask_result, None + ) + else: + init_value, *values = [ + s.zip_with(~nm, "") for s, nm in zip(series, null_mask) + ] + + sep_array = init_value.from_iterable( + data=[separator] * len(init_value), + name="sep", + index=init_value.native.index, + context=self, + ) + separators = (sep_array.zip_with(~nm, "") for nm in null_mask[:-1]) + result = reduce( + operator.add, (s + v for s, v in zip(separators, values)), init_value + ) + + return 
[result] + + return self._expr._from_callable( + func=func, + depth=max(x._depth for x in exprs) + 1, + function_name="concat_str", + evaluate_output_names=combine_evaluate_output_names(*exprs), + alias_output_names=combine_alias_output_names(*exprs), + context=self, + ) + + +class PandasWhen(EagerWhen[PandasLikeDataFrame, PandasLikeSeries, PandasLikeExpr]): + @property + def _then(self) -> type[PandasThen]: + return PandasThen + + def _if_then_else( + self, + when: PandasLikeSeries, + then: PandasLikeSeries, + otherwise: PandasLikeSeries | None, + /, + ) -> PandasLikeSeries: + if otherwise is None: + when, then = align_series_full_broadcast(when, then) + res_native = then.native.where(when.native) + else: + when, then, otherwise = align_series_full_broadcast(when, then, otherwise) + res_native = then.native.where(when.native, otherwise.native) + return then._with_native(res_native) + + +class PandasThen( + CompliantThen[PandasLikeDataFrame, PandasLikeSeries, PandasLikeExpr], PandasLikeExpr +): ... diff --git a/venv/lib/python3.8/site-packages/narwhals/_pandas_like/selectors.py b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/selectors.py new file mode 100644 index 0000000..f6b2a73 --- /dev/null +++ b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/selectors.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from narwhals._compliant import CompliantSelector, EagerSelectorNamespace +from narwhals._pandas_like.expr import PandasLikeExpr + +if TYPE_CHECKING: + from narwhals._pandas_like.dataframe import PandasLikeDataFrame # noqa: F401 + from narwhals._pandas_like.series import PandasLikeSeries # noqa: F401 + + +class PandasSelectorNamespace( + EagerSelectorNamespace["PandasLikeDataFrame", "PandasLikeSeries"] +): + @property + def _selector(self) -> type[PandasSelector]: + return PandasSelector + + +class PandasSelector( # type: ignore[misc] + CompliantSelector["PandasLikeDataFrame", "PandasLikeSeries"], PandasLikeExpr +): + def _to_expr(self) -> PandasLikeExpr: + return PandasLikeExpr( + self._call, + depth=self._depth, + function_name=self._function_name, + evaluate_output_names=self._evaluate_output_names, + alias_output_names=self._alias_output_names, + implementation=self._implementation, + backend_version=self._backend_version, + version=self._version, + ) diff --git a/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series.py b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series.py new file mode 100644 index 0000000..0ea4e83 --- /dev/null +++ b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series.py @@ -0,0 +1,1109 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Iterable, Iterator, Mapping, Sequence, cast + +import numpy as np + +from narwhals._compliant import EagerSeries +from narwhals._pandas_like.series_cat import PandasLikeSeriesCatNamespace +from narwhals._pandas_like.series_dt import PandasLikeSeriesDateTimeNamespace +from narwhals._pandas_like.series_list import PandasLikeSeriesListNamespace +from narwhals._pandas_like.series_str import PandasLikeSeriesStringNamespace +from narwhals._pandas_like.series_struct import PandasLikeSeriesStructNamespace +from narwhals._pandas_like.utils import ( + align_and_extract_native, + get_dtype_backend, + narwhals_to_native_dtype, + native_to_narwhals_dtype, + object_native_to_narwhals_dtype, + rename, + select_columns_by_name, + set_index, +) +from narwhals._utils import ( + Implementation, + is_list_of, + parse_version, + 
validate_backend_version, +) +from narwhals.dependencies import is_numpy_array_1d, is_pandas_like_series +from narwhals.exceptions import InvalidOperationError + +if TYPE_CHECKING: + from types import ModuleType + from typing import Hashable + + import pandas as pd + import polars as pl + import pyarrow as pa + from typing_extensions import Self, TypeIs + + from narwhals._arrow.typing import ChunkedArrayAny + from narwhals._pandas_like.dataframe import PandasLikeDataFrame + from narwhals._pandas_like.namespace import PandasLikeNamespace + from narwhals._utils import Version, _FullContext + from narwhals.dtypes import DType + from narwhals.typing import ( + ClosedInterval, + FillNullStrategy, + Into1DArray, + IntoDType, + NonNestedLiteral, + NumericLiteral, + RankMethod, + RollingInterpolationMethod, + SizedMultiIndexSelector, + TemporalLiteral, + _1DArray, + _AnyDArray, + _SliceIndex, + ) + +PANDAS_TO_NUMPY_DTYPE_NO_MISSING = { + "Int64": "int64", + "int64[pyarrow]": "int64", + "Int32": "int32", + "int32[pyarrow]": "int32", + "Int16": "int16", + "int16[pyarrow]": "int16", + "Int8": "int8", + "int8[pyarrow]": "int8", + "UInt64": "uint64", + "uint64[pyarrow]": "uint64", + "UInt32": "uint32", + "uint32[pyarrow]": "uint32", + "UInt16": "uint16", + "uint16[pyarrow]": "uint16", + "UInt8": "uint8", + "uint8[pyarrow]": "uint8", + "Float64": "float64", + "float64[pyarrow]": "float64", + "Float32": "float32", + "float32[pyarrow]": "float32", +} +PANDAS_TO_NUMPY_DTYPE_MISSING = { + "Int64": "float64", + "int64[pyarrow]": "float64", + "Int32": "float64", + "int32[pyarrow]": "float64", + "Int16": "float64", + "int16[pyarrow]": "float64", + "Int8": "float64", + "int8[pyarrow]": "float64", + "UInt64": "float64", + "uint64[pyarrow]": "float64", + "UInt32": "float64", + "uint32[pyarrow]": "float64", + "UInt16": "float64", + "uint16[pyarrow]": "float64", + "UInt8": "float64", + "uint8[pyarrow]": "float64", + "Float64": "float64", + "float64[pyarrow]": "float64", + "Float32": "float32", + "float32[pyarrow]": "float32", +} + + +class PandasLikeSeries(EagerSeries[Any]): + def __init__( + self, + native_series: Any, + *, + implementation: Implementation, + backend_version: tuple[int, ...], + version: Version, + ) -> None: + self._name = native_series.name + self._native_series = native_series + self._implementation = implementation + self._backend_version = backend_version + self._version = version + validate_backend_version(self._implementation, self._backend_version) + # Flag which indicates if, in the final step before applying an operation, + # the single value behind the PandasLikeSeries should be extracted and treated + # as a Scalar. For example, in `nw.col('a') - nw.lit(3)`, the latter would + # become a Series of length 1. Rather than doing a full broadcast so it matches + # the length of the whole dataframe, we just extract the scalar.
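+ # Editor's note (illustrative): in `df.select(nw.col("a") - nw.lit(3))`,
+ # `lit` produces a length-1 series flagged with `_broadcast = True`, so the
+ # subtraction extracts the scalar `3` rather than reindexing a full column.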
+ self._broadcast = False + + @property + def native(self) -> Any: + return self._native_series + + def __native_namespace__(self) -> ModuleType: + if self._implementation.is_pandas_like(): + return self._implementation.to_native_namespace() + + msg = f"Expected pandas/modin/cudf, got: {type(self._implementation)}" # pragma: no cover + raise AssertionError(msg) + + def __narwhals_namespace__(self) -> PandasLikeNamespace: + from narwhals._pandas_like.namespace import PandasLikeNamespace + + return PandasLikeNamespace( + self._implementation, self._backend_version, self._version + ) + + def _gather(self, rows: SizedMultiIndexSelector[pd.Series[Any]]) -> Self: + rows = list(rows) if isinstance(rows, tuple) else rows + return self._with_native(self.native.iloc[rows]) + + def _gather_slice(self, rows: _SliceIndex | range) -> Self: + return self._with_native( + self.native.iloc[slice(rows.start, rows.stop, rows.step)] + ) + + def _with_version(self, version: Version) -> Self: + return self.__class__( + self.native, + implementation=self._implementation, + backend_version=self._backend_version, + version=version, + ) + + def _with_native(self, series: Any, *, preserve_broadcast: bool = False) -> Self: + result = self.__class__( + series, + implementation=self._implementation, + backend_version=self._backend_version, + version=self._version, + ) + if preserve_broadcast: + result._broadcast = self._broadcast + return result + + @classmethod + def from_iterable( + cls, + data: Iterable[Any], + *, + context: _FullContext, + name: str = "", + dtype: IntoDType | None = None, + index: Any = None, + ) -> Self: + implementation = context._implementation + backend_version = context._backend_version + version = context._version + ns = implementation.to_native_namespace() + kwds: dict[str, Any] = {} + if dtype: + kwds["dtype"] = narwhals_to_native_dtype( + dtype, None, implementation, backend_version, version + ) + else: + if implementation.is_pandas(): + kwds["copy"] = False + if index is not None and len(index): + kwds["index"] = index + return cls.from_native(ns.Series(data, name=name, **kwds), context=context) + + @staticmethod + def _is_native(obj: Any) -> TypeIs[Any]: + return is_pandas_like_series(obj) # pragma: no cover + + @classmethod + def from_native(cls, data: Any, /, *, context: _FullContext) -> Self: + return cls( + data, + implementation=context._implementation, + backend_version=context._backend_version, + version=context._version, + ) + + @classmethod + def from_numpy(cls, data: Into1DArray, /, *, context: _FullContext) -> Self: + implementation = context._implementation + arr = data if is_numpy_array_1d(data) else [data] + native = implementation.to_native_namespace().Series(arr, name="") + return cls.from_native(native, context=context) + + @property + def name(self) -> str: + return self._name + + @property + def dtype(self) -> DType: + native_dtype = self.native.dtype + return ( + native_to_narwhals_dtype(native_dtype, self._version, self._implementation) + if native_dtype != "object" + else object_native_to_narwhals_dtype( + self.native, self._version, self._implementation + ) + ) + + def ewm_mean( + self, + *, + com: float | None, + span: float | None, + half_life: float | None, + alpha: float | None, + adjust: bool, + min_samples: int, + ignore_nulls: bool, + ) -> PandasLikeSeries: + ser = self.native + mask_na = ser.isna() + if self._implementation is Implementation.CUDF: + if (min_samples == 0 and not ignore_nulls) or (not mask_na.any()): + result = ser.ewm( + com=com, span=span, 
span, halflife=half_life, alpha=alpha, adjust=adjust + ).mean() + else: + msg = ( + "cuDF only supports `ewm_mean` when there are no missing values " + "or when both `min_period=0` and `ignore_nulls=False`" + ) + raise NotImplementedError(msg) + else: + result = ser.ewm( + com, span, half_life, alpha, min_samples, adjust, ignore_na=ignore_nulls + ).mean() + result[mask_na] = None + return self._with_native(result) + + def scatter(self, indices: int | Sequence[int], values: Any) -> Self: + if isinstance(values, self.__class__): + values = set_index( + values.native, + self.native.index[indices], + implementation=self._implementation, + backend_version=self._backend_version, + ) + s = self.native.copy(deep=True) + s.iloc[indices] = values + s.name = self.name + return self._with_native(s) + + def _scatter_in_place(self, indices: Self, values: Self) -> None: + # Scatter, modifying original Series. Use with care! + values_native = set_index( + values.native, + self.native.index[indices.native], + implementation=self._implementation, + backend_version=self._backend_version, + ) + if self._implementation is Implementation.PANDAS and parse_version(np) < (2,): + values_native = values_native.copy() # pragma: no cover + min_pd_version = (1, 2) + if ( + self._implementation is Implementation.PANDAS + and self._backend_version < min_pd_version + ): + self.native.iloc[indices.native.values] = values_native # noqa: PD011 + else: + self.native.iloc[indices.native] = values_native + + def cast(self, dtype: IntoDType) -> Self: + pd_dtype = narwhals_to_native_dtype( + dtype, + dtype_backend=get_dtype_backend(self.native.dtype, self._implementation), + implementation=self._implementation, + backend_version=self._backend_version, + version=self._version, + ) + return self._with_native(self.native.astype(pd_dtype), preserve_broadcast=True) + + def item(self, index: int | None) -> Any: + # cuDF doesn't have Series.item().
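+ # Editor's note: `s.item()` is only valid for a length-1 series (e.g. the
+ # result of a scalar reduction), while `s.item(i)` is positional and
+ # equivalent to `s.native.iloc[i]`, as implemented below.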
+ if index is None: + if len(self) != 1: + msg = ( + "can only call '.item()' if the Series is of length 1," + f" or an explicit index is provided (Series is of length {len(self)})" + ) + raise ValueError(msg) + return self.native.iloc[0] + return self.native.iloc[index] + + def to_frame(self) -> PandasLikeDataFrame: + from narwhals._pandas_like.dataframe import PandasLikeDataFrame + + return PandasLikeDataFrame( + self.native.to_frame(), + implementation=self._implementation, + backend_version=self._backend_version, + version=self._version, + validate_column_names=False, + ) + + def to_list(self) -> list[Any]: + is_cudf = self._implementation.is_cudf() + return self.native.to_arrow().to_pylist() if is_cudf else self.native.to_list() + + def is_between( + self, lower_bound: Any, upper_bound: Any, closed: ClosedInterval + ) -> Self: + ser = self.native + _, lower_bound = align_and_extract_native(self, lower_bound) + _, upper_bound = align_and_extract_native(self, upper_bound) + if closed == "left": + res = ser.ge(lower_bound) & ser.lt(upper_bound) + elif closed == "right": + res = ser.gt(lower_bound) & ser.le(upper_bound) + elif closed == "none": + res = ser.gt(lower_bound) & ser.lt(upper_bound) + elif closed == "both": + res = ser.ge(lower_bound) & ser.le(upper_bound) + else: # pragma: no cover + raise AssertionError + return self._with_native(res).alias(ser.name) + + def is_in(self, other: Any) -> PandasLikeSeries: + return self._with_native(self.native.isin(other)) + + def arg_true(self) -> PandasLikeSeries: + ser = self.native + result = ser.__class__(range(len(ser)), name=ser.name, index=ser.index).loc[ser] + return self._with_native(result) + + def arg_min(self) -> int: + if self._implementation is Implementation.PANDAS and self._backend_version < (1,): + return self.native.to_numpy().argmin() + return self.native.argmin() + + def arg_max(self) -> int: + ser = self.native + if self._implementation is Implementation.PANDAS and self._backend_version < (1,): + return ser.to_numpy().argmax() + return ser.argmax() + + # Binary comparisons + + def filter(self, predicate: Any) -> PandasLikeSeries: + if not is_list_of(predicate, bool): + _, other_native = align_and_extract_native(self, predicate) + else: + other_native = predicate + return self._with_native(self.native.loc[other_native]).alias(self.name) + + def __eq__(self, other: object) -> PandasLikeSeries: # type: ignore[override] + ser, other = align_and_extract_native(self, other) + return self._with_native(ser == other).alias(self.name) + + def __ne__(self, other: object) -> PandasLikeSeries: # type: ignore[override] + ser, other = align_and_extract_native(self, other) + return self._with_native(ser != other).alias(self.name) + + def __ge__(self, other: Any) -> PandasLikeSeries: + ser, other = align_and_extract_native(self, other) + return self._with_native(ser >= other).alias(self.name) + + def __gt__(self, other: Any) -> PandasLikeSeries: + ser, other = align_and_extract_native(self, other) + return self._with_native(ser > other).alias(self.name) + + def __le__(self, other: Any) -> PandasLikeSeries: + ser, other = align_and_extract_native(self, other) + return self._with_native(ser <= other).alias(self.name) + + def __lt__(self, other: Any) -> PandasLikeSeries: + ser, other = align_and_extract_native(self, other) + return self._with_native(ser < other).alias(self.name) + + def __and__(self, other: Any) -> PandasLikeSeries: + ser, other = align_and_extract_native(self, other) + return self._with_native(ser & other).alias(self.name) + 
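+ # Editor's note: each binary op above and below follows one pattern -
+ # `align_and_extract_native` aligns indices (or extracts the scalar from a
+ # broadcast series), the native operator runs, and `.alias(self.name)` keeps
+ # the left-hand name, so e.g. `(s_a < s_b).name == s_a.name` (illustrative).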
+ def __rand__(self, other: Any) -> PandasLikeSeries: + ser, other = align_and_extract_native(self, other) + ser = cast("pd.Series[Any]", ser) + return self._with_native(ser.__and__(other)).alias(self.name) + + def __or__(self, other: Any) -> PandasLikeSeries: + ser, other = align_and_extract_native(self, other) + return self._with_native(ser | other).alias(self.name) + + def __ror__(self, other: Any) -> PandasLikeSeries: + ser, other = align_and_extract_native(self, other) + ser = cast("pd.Series[Any]", ser) + return self._with_native(ser.__or__(other)).alias(self.name) + + def __add__(self, other: Any) -> PandasLikeSeries: + ser, other = align_and_extract_native(self, other) + return self._with_native(ser + other).alias(self.name) + + def __radd__(self, other: Any) -> PandasLikeSeries: + _, other_native = align_and_extract_native(self, other) + return self._with_native(self.native.__radd__(other_native)).alias(self.name) + + def __sub__(self, other: Any) -> PandasLikeSeries: + ser, other = align_and_extract_native(self, other) + return self._with_native(ser - other).alias(self.name) + + def __rsub__(self, other: Any) -> PandasLikeSeries: + _, other_native = align_and_extract_native(self, other) + return self._with_native(self.native.__rsub__(other_native)).alias(self.name) + + def __mul__(self, other: Any) -> PandasLikeSeries: + ser, other = align_and_extract_native(self, other) + return self._with_native(ser * other).alias(self.name) + + def __rmul__(self, other: Any) -> PandasLikeSeries: + _, other_native = align_and_extract_native(self, other) + return self._with_native(self.native.__rmul__(other_native)).alias(self.name) + + def __truediv__(self, other: Any) -> PandasLikeSeries: + ser, other = align_and_extract_native(self, other) + return self._with_native(ser / other).alias(self.name) + + def __rtruediv__(self, other: Any) -> PandasLikeSeries: + _, other_native = align_and_extract_native(self, other) + return self._with_native(self.native.__rtruediv__(other_native)).alias(self.name) + + def __floordiv__(self, other: Any) -> PandasLikeSeries: + ser, other = align_and_extract_native(self, other) + return self._with_native(ser // other).alias(self.name) + + def __rfloordiv__(self, other: Any) -> PandasLikeSeries: + _, other_native = align_and_extract_native(self, other) + return self._with_native(self.native.__rfloordiv__(other_native)).alias(self.name) + + def __pow__(self, other: Any) -> PandasLikeSeries: + ser, other = align_and_extract_native(self, other) + return self._with_native(ser**other).alias(self.name) + + def __rpow__(self, other: Any) -> PandasLikeSeries: + _, other_native = align_and_extract_native(self, other) + return self._with_native(self.native.__rpow__(other_native)).alias(self.name) + + def __mod__(self, other: Any) -> PandasLikeSeries: + ser, other = align_and_extract_native(self, other) + return self._with_native(ser % other).alias(self.name) + + def __rmod__(self, other: Any) -> PandasLikeSeries: + _, other_native = align_and_extract_native(self, other) + return self._with_native(self.native.__rmod__(other_native)).alias(self.name) + + # Unary + + def __invert__(self: PandasLikeSeries) -> PandasLikeSeries: + return self._with_native(~self.native) + + # Reductions + + def any(self) -> bool: + return self.native.any() + + def all(self) -> bool: + return self.native.all() + + def min(self) -> Any: + return self.native.min() + + def max(self) -> Any: + return self.native.max() + + def sum(self) -> float: + return self.native.sum() + + def count(self) -> int: + 
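+ # Editor's note: pandas-like `count()` only counts non-null values, unlike
+ # `len`; e.g. `pd.Series([1.0, None]).count() == 1` while `len(...) == 2`.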
return self.native.count() + + def mean(self) -> float: + return self.native.mean() + + def median(self) -> float: + if not self.dtype.is_numeric(): + msg = "`median` operation not supported for non-numeric input type." + raise InvalidOperationError(msg) + return self.native.median() + + def std(self, *, ddof: int) -> float: + return self.native.std(ddof=ddof) + + def var(self, *, ddof: int) -> float: + return self.native.var(ddof=ddof) + + def skew(self) -> float | None: + ser_not_null = self.native.dropna() + if len(ser_not_null) == 0: + return None + elif len(ser_not_null) == 1: + return float("nan") + elif len(ser_not_null) == 2: + return 0.0 + else: + m = ser_not_null - ser_not_null.mean() + m2 = (m**2).mean() + m3 = (m**3).mean() + return m3 / (m2**1.5) if m2 != 0 else float("nan") + + def len(self) -> int: + return len(self.native) + + # Transformations + + def is_null(self) -> PandasLikeSeries: + return self._with_native(self.native.isna(), preserve_broadcast=True) + + def is_nan(self) -> PandasLikeSeries: + ser = self.native + if self.dtype.is_numeric(): + return self._with_native(ser != ser, preserve_broadcast=True) # noqa: PLR0124 + msg = f"`.is_nan` only supported for numeric dtype and not {self.dtype}, did you mean `.is_null`?" + raise InvalidOperationError(msg) + + def fill_null( + self, + value: Self | NonNestedLiteral, + strategy: FillNullStrategy | None, + limit: int | None, + ) -> Self: + ser = self.native + if value is not None: + _, native_value = align_and_extract_native(self, value) + res_ser = self._with_native( + ser.fillna(value=native_value), preserve_broadcast=True + ) + else: + res_ser = self._with_native( + ser.ffill(limit=limit) + if strategy == "forward" + else ser.bfill(limit=limit), + preserve_broadcast=True, + ) + + return res_ser + + def drop_nulls(self) -> PandasLikeSeries: + return self._with_native(self.native.dropna()) + + def n_unique(self) -> int: + return self.native.nunique(dropna=False) + + def sample( + self, + n: int | None, + *, + fraction: float | None, + with_replacement: bool, + seed: int | None, + ) -> Self: + return self._with_native( + self.native.sample( + n=n, frac=fraction, replace=with_replacement, random_state=seed + ) + ) + + def abs(self) -> PandasLikeSeries: + return self._with_native(self.native.abs()) + + def cum_sum(self, *, reverse: bool) -> Self: + result = ( + self.native.cumsum(skipna=True) + if not reverse + else self.native[::-1].cumsum(skipna=True)[::-1] + ) + return self._with_native(result) + + def unique(self, *, maintain_order: bool = True) -> PandasLikeSeries: + """Pandas always maintains order, as per its docstring. + + > Uniques are returned in order of appearance. 
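+ E.g. (illustrative) `pd.Series([2, 1, 2]).unique()` returns `array([2, 1])`.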
+ """ + return self._with_native( + self.native.__class__(self.native.unique(), name=self.name) + ) + + def diff(self) -> PandasLikeSeries: + return self._with_native(self.native.diff()) + + def shift(self, n: int) -> PandasLikeSeries: + return self._with_native(self.native.shift(n)) + + def replace_strict( + self, + old: Sequence[Any] | Mapping[Any, Any], + new: Sequence[Any], + *, + return_dtype: IntoDType | None, + ) -> PandasLikeSeries: + tmp_name = f"{self.name}_tmp" + dtype_backend = get_dtype_backend(self.native.dtype, self._implementation) + dtype = ( + narwhals_to_native_dtype( + return_dtype, + dtype_backend, + self._implementation, + self._backend_version, + self._version, + ) + if return_dtype + else None + ) + namespace = self.__native_namespace__() + other = namespace.DataFrame( + {self.name: old, tmp_name: namespace.Series(new, dtype=dtype)} + ) + result = self._with_native( + self.native.to_frame().merge(other, on=self.name, how="left")[tmp_name] + ).alias(self.name) + if result.is_null().sum() != self.is_null().sum(): + msg = ( + "replace_strict did not replace all non-null values.\n\n" + f"The following did not get replaced: {self.filter(~self.is_null() & result.is_null()).unique(maintain_order=False).to_list()}" + ) + raise ValueError(msg) + return result + + def sort(self, *, descending: bool, nulls_last: bool) -> PandasLikeSeries: + na_position = "last" if nulls_last else "first" + return self._with_native( + self.native.sort_values(ascending=not descending, na_position=na_position) + ).alias(self.name) + + def alias(self, name: str | Hashable) -> Self: + if name != self.name: + return self._with_native( + rename( + self.native, + name, + implementation=self._implementation, + backend_version=self._backend_version, + ), + preserve_broadcast=True, + ) + return self + + def __array__(self, dtype: Any, *, copy: bool | None) -> _1DArray: + # pandas used to always return object dtype for nullable dtypes. + # So, we intercept __array__ and pass to `to_numpy` ourselves to make + # sure an appropriate numpy dtype is returned. + return self.to_numpy(dtype=dtype, copy=copy) + + def to_numpy(self, dtype: Any = None, *, copy: bool | None = None) -> _1DArray: + # the default is meant to be None, but pandas doesn't allow it? + # https://numpy.org/doc/stable/reference/generated/numpy.ndarray.__array__.html + dtypes = self._version.dtypes + if isinstance(self.dtype, dtypes.Datetime) and self.dtype.time_zone is not None: + s = self.dt.convert_time_zone("UTC").dt.replace_time_zone(None).native + else: + s = self.native + + has_missing = s.isna().any() + kwargs: dict[Any, Any] = {"copy": copy or self._implementation.is_cudf()} + if has_missing and str(s.dtype) in PANDAS_TO_NUMPY_DTYPE_MISSING: + if self._implementation is Implementation.PANDAS and self._backend_version < ( + 1, + ): # pragma: no cover + ... 
+ else: + kwargs.update({"na_value": float("nan")}) + dtype = dtype or PANDAS_TO_NUMPY_DTYPE_MISSING[str(s.dtype)] + if not has_missing and str(s.dtype) in PANDAS_TO_NUMPY_DTYPE_NO_MISSING: + dtype = dtype or PANDAS_TO_NUMPY_DTYPE_NO_MISSING[str(s.dtype)] + return s.to_numpy(dtype=dtype, **kwargs) + + def to_pandas(self) -> pd.Series[Any]: + if self._implementation is Implementation.PANDAS: + return self.native + elif self._implementation is Implementation.CUDF: # pragma: no cover + return self.native.to_pandas() + elif self._implementation is Implementation.MODIN: + return self.native._to_pandas() + msg = f"Unknown implementation: {self._implementation}" # pragma: no cover + raise AssertionError(msg) + + def to_polars(self) -> pl.Series: + import polars as pl # ignore-banned-import + + return pl.from_pandas(self.to_pandas()) + + # --- descriptive --- + def is_unique(self) -> Self: + return self._with_native(~self.native.duplicated(keep=False)).alias(self.name) + + def null_count(self) -> int: + return self.native.isna().sum() + + def is_first_distinct(self) -> Self: + return self._with_native(~self.native.duplicated(keep="first")).alias(self.name) + + def is_last_distinct(self) -> Self: + return self._with_native(~self.native.duplicated(keep="last")).alias(self.name) + + def is_sorted(self, *, descending: bool) -> bool: + if not isinstance(descending, bool): + msg = f"argument 'descending' should be boolean, found {type(descending)}" + raise TypeError(msg) + + if descending: + return self.native.is_monotonic_decreasing + else: + return self.native.is_monotonic_increasing + + def value_counts( + self, *, sort: bool, parallel: bool, name: str | None, normalize: bool + ) -> PandasLikeDataFrame: + """Parallel is unused, exists for compatibility.""" + from narwhals._pandas_like.dataframe import PandasLikeDataFrame + + index_name_ = "index" if self._name is None else self._name + value_name_ = name or ("proportion" if normalize else "count") + val_count = self.native.value_counts( + dropna=False, sort=False, normalize=normalize + ).reset_index() + + val_count.columns = [index_name_, value_name_] + + if sort: + val_count = val_count.sort_values(value_name_, ascending=False) + + return PandasLikeDataFrame.from_native(val_count, context=self) + + def quantile( + self, quantile: float, interpolation: RollingInterpolationMethod + ) -> float: + return self.native.quantile(q=quantile, interpolation=interpolation) + + def zip_with(self, mask: Any, other: Any) -> PandasLikeSeries: + ser = self.native + _, mask = align_and_extract_native(self, mask) + _, other = align_and_extract_native(self, other) + res = ser.where(mask, other) + return self._with_native(res) + + def head(self, n: int) -> Self: + return self._with_native(self.native.head(n)) + + def tail(self, n: int) -> Self: + return self._with_native(self.native.tail(n)) + + def round(self, decimals: int) -> Self: + return self._with_native(self.native.round(decimals=decimals)) + + def to_dummies(self, *, separator: str, drop_first: bool) -> PandasLikeDataFrame: + from narwhals._pandas_like.dataframe import PandasLikeDataFrame + + plx = self.__native_namespace__() + series = self.native + name = str(self._name) if self._name else "" + + null_col_pl = f"{name}{separator}null" + + has_nulls = series.isna().any() + result = plx.get_dummies( + series, + prefix=name, + prefix_sep=separator, + drop_first=drop_first, + # Adds a null column at the end, depending on whether or not there are any. 
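+ # Editor's sketch (illustrative): for `pd.Series(["x", None], name="a")` with
+ # separator "_", pandas appends a null-level dummy column (e.g. "a_nan"),
+ # which the block below renames to "a_null" and moves to the front.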
+ dummy_na=has_nulls, + dtype="int8", + ) + if has_nulls: + *cols, null_col_pd = list(result.columns) + output_order = [null_col_pd, *cols] + result = rename( + select_columns_by_name( + result, output_order, self._backend_version, self._implementation + ), + columns={null_col_pd: null_col_pl}, + implementation=self._implementation, + backend_version=self._backend_version, + ) + return PandasLikeDataFrame.from_native(result, context=self) + + def gather_every(self, n: int, offset: int) -> Self: + return self._with_native(self.native.iloc[offset::n]) + + def clip( + self, + lower_bound: Self | NumericLiteral | TemporalLiteral | None, + upper_bound: Self | NumericLiteral | TemporalLiteral | None, + ) -> Self: + _, lower = ( + align_and_extract_native(self, lower_bound) if lower_bound else (None, None) + ) + _, upper = ( + align_and_extract_native(self, upper_bound) if upper_bound else (None, None) + ) + kwargs = {"axis": 0} if self._implementation is Implementation.MODIN else {} + return self._with_native(self.native.clip(lower, upper, **kwargs)) + + def to_arrow(self) -> pa.Array[Any]: + if self._implementation is Implementation.CUDF: + return self.native.to_arrow() + + import pyarrow as pa # ignore-banned-import() + + return pa.Array.from_pandas(self.native) + + def mode(self) -> Self: + result = self.native.mode() + result.name = self.name + return self._with_native(result) + + def cum_count(self, *, reverse: bool) -> Self: + not_na_series = ~self.native.isna() + result = ( + not_na_series.cumsum() + if not reverse + else len(self) - not_na_series.cumsum() + not_na_series - 1 + ) + return self._with_native(result) + + def cum_min(self, *, reverse: bool) -> Self: + result = ( + self.native.cummin(skipna=True) + if not reverse + else self.native[::-1].cummin(skipna=True)[::-1] + ) + return self._with_native(result) + + def cum_max(self, *, reverse: bool) -> Self: + result = ( + self.native.cummax(skipna=True) + if not reverse + else self.native[::-1].cummax(skipna=True)[::-1] + ) + return self._with_native(result) + + def cum_prod(self, *, reverse: bool) -> Self: + result = ( + self.native.cumprod(skipna=True) + if not reverse + else self.native[::-1].cumprod(skipna=True)[::-1] + ) + return self._with_native(result) + + def rolling_sum(self, window_size: int, *, min_samples: int, center: bool) -> Self: + result = self.native.rolling( + window=window_size, min_periods=min_samples, center=center + ).sum() + return self._with_native(result) + + def rolling_mean(self, window_size: int, *, min_samples: int, center: bool) -> Self: + result = self.native.rolling( + window=window_size, min_periods=min_samples, center=center + ).mean() + return self._with_native(result) + + def rolling_var( + self, window_size: int, *, min_samples: int, center: bool, ddof: int + ) -> Self: + result = self.native.rolling( + window=window_size, min_periods=min_samples, center=center + ).var(ddof=ddof) + return self._with_native(result) + + def rolling_std( + self, window_size: int, *, min_samples: int, center: bool, ddof: int + ) -> Self: + result = self.native.rolling( + window=window_size, min_periods=min_samples, center=center + ).std(ddof=ddof) + return self._with_native(result) + + def __iter__(self) -> Iterator[Any]: + yield from self.native.__iter__() + + def __contains__(self, other: Any) -> bool: + return self.native.isna().any() if other is None else (self.native == other).any() + + def is_finite(self) -> Self: + s = self.native + return self._with_native((s > float("-inf")) & (s < float("inf"))) + + def 
rank(self, method: RankMethod, *, descending: bool) -> Self: + pd_method = "first" if method == "ordinal" else method + name = self.name + if ( + self._implementation is Implementation.PANDAS + and self._backend_version < (3,) + and self.dtype.is_integer() + and (null_mask := self.native.isna()).any() + ): + # crazy workaround for the case of `na_option="keep"` and nullable + # integer dtypes. This should be supported in pandas > 3.0 + # https://github.com/pandas-dev/pandas/issues/56976 + ranked_series = ( + self.native.to_frame() + .assign(**{f"{name}_is_null": null_mask}) + .groupby(f"{name}_is_null") + .rank( + method=pd_method, + na_option="keep", + ascending=not descending, + pct=False, + )[name] + ) + else: + ranked_series = self.native.rank( + method=pd_method, na_option="keep", ascending=not descending, pct=False + ) + return self._with_native(ranked_series) + + def hist( # noqa: C901, PLR0912 + self, + bins: list[float | int] | None, + *, + bin_count: int | None, + include_breakpoint: bool, + ) -> PandasLikeDataFrame: + from numpy import linspace, zeros + + from narwhals._pandas_like.dataframe import PandasLikeDataFrame + + ns = self.__native_namespace__() + data: dict[str, Sequence[int | float | str] | _AnyDArray] + + if bin_count == 0 or (bins is not None and len(bins) <= 1): + data = {} + if include_breakpoint: + data["breakpoint"] = [] + data["count"] = [] + return PandasLikeDataFrame.from_native(ns.DataFrame(data), context=self) + + if self.native.count() < 1: + if bins is not None: + data = {"breakpoint": bins[1:], "count": zeros(shape=len(bins) - 1)} + else: + count = cast("int", bin_count) + if bin_count == 1: + data = {"breakpoint": [1.0], "count": [0]} + else: + data = { + "breakpoint": linspace(0, 1, count + 1)[1:], + "count": zeros(shape=count), + } + if not include_breakpoint: + del data["breakpoint"] + return PandasLikeDataFrame.from_native(ns.DataFrame(data), context=self) + + if bin_count is not None: + # use Polars binning behavior + lower, upper = self.native.min(), self.native.max() + if lower == upper: + lower -= 0.5 + upper += 0.5 + + if bin_count == 1: + data = {"breakpoint": [upper], "count": [self.native.count()]} + if not include_breakpoint: + del data["breakpoint"] + return PandasLikeDataFrame.from_native(ns.DataFrame(data), context=self) + + bins = linspace(lower, upper, bin_count + 1) + bin_count = None + + # pandas (2.2.*) .value_counts(bins=int) adjusts the lowest bin twice, resulting in improper counts. + # pandas (2.2.*) .value_counts(bins=[...]) adjusts the lowest bin which should not happen since + # the bins were explicitly passed in. + categories = ns.cut( + self.native, + bins=bins if bin_count is None else bin_count, + include_lowest=True, # Polars 1.27.0 always includes the lowest bin + ) + # modin (0.32.0) .value_counts(...) silently drops bins with empty observations, .reindex + # is necessary to restore these bins. + result = categories.value_counts(dropna=True, sort=False).reindex( + categories.cat.categories, fill_value=0 + ) + data = {} + if include_breakpoint: + data["breakpoint"] = bins[1:] if bins is not None else result.index.right + data["count"] = result.reset_index(drop=True) + return PandasLikeDataFrame.from_native(ns.DataFrame(data), context=self) + + def log(self, base: float) -> Self: + native = self.native + implementation = self._implementation + + dtype_backend = get_dtype_backend(native.dtype, implementation=implementation) + + if implementation.is_cudf(): + import cupy as cp # ignore-banned-import # cuDF dependency.
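+ # Editor's note: both this branch and the numpy fallback below use the
+ # change-of-base identity log_b(x) = ln(x) / ln(b).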
+ + native = self.native + log_arr = cp.log(native) / cp.log(base) + result_native = type(native)(log_arr, index=native.index, name=native.name) + return self._with_native(result_native) + + if dtype_backend == "pyarrow": + import pyarrow.compute as pc + + from narwhals._arrow.utils import native_to_narwhals_dtype + + ca = native.array._pa_array + result_arr = cast("ChunkedArrayAny", pc.logb(ca, base)) + nw_dtype = native_to_narwhals_dtype(result_arr.type, self._version) + out_dtype = narwhals_to_native_dtype( + nw_dtype, + "pyarrow", + self._implementation, + self._backend_version, + self._version, + ) + result_native = native.__class__( + result_arr, dtype=out_dtype, index=native.index, name=native.name + ) + else: + result_native = np.log(native) / np.log(base) + return self._with_native(result_native) + + def exp(self) -> Self: + native = self.native + implementation = self._implementation + + dtype_backend = get_dtype_backend(native.dtype, implementation=implementation) + + if implementation.is_cudf(): + import cupy as cp # ignore-banned-import # cuDF dependency. + + native = self.native + exp_arr = cp.exp(native) + result_native = type(native)(exp_arr, index=native.index, name=native.name) + return self._with_native(result_native) + + if dtype_backend == "pyarrow": + import pyarrow.compute as pc + + from narwhals._arrow.utils import native_to_narwhals_dtype + + ca = native.array._pa_array + result_arr = cast("ChunkedArrayAny", pc.exp(ca)) + nw_dtype = native_to_narwhals_dtype(result_arr.type, self._version) + out_dtype = narwhals_to_native_dtype( + nw_dtype, + "pyarrow", + self._implementation, + self._backend_version, + self._version, + ) + result_native = native.__class__( + result_arr, dtype=out_dtype, index=native.index, name=native.name + ) + else: + result_native = np.exp(native) + return self._with_native(result_native) + + @property + def str(self) -> PandasLikeSeriesStringNamespace: + return PandasLikeSeriesStringNamespace(self) + + @property + def dt(self) -> PandasLikeSeriesDateTimeNamespace: + return PandasLikeSeriesDateTimeNamespace(self) + + @property + def cat(self) -> PandasLikeSeriesCatNamespace: + return PandasLikeSeriesCatNamespace(self) + + @property + def list(self) -> PandasLikeSeriesListNamespace: + if not hasattr(self.native, "list"): + msg = "Series must be of PyArrow List type to support list namespace." + raise TypeError(msg) + return PandasLikeSeriesListNamespace(self) + + @property + def struct(self) -> PandasLikeSeriesStructNamespace: + if not hasattr(self.native, "struct"): + msg = "Series must be of PyArrow Struct type to support struct namespace." 
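+ # Editor's sketch (assumes pandas >= 2.2 with pyarrow installed): a
+ # struct-typed series exposing `.struct` can be built like
+ #     pd.Series([{"a": 1}], dtype=pd.ArrowDtype(pa.struct({"a": pa.int64()})))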
+ raise TypeError(msg) + return PandasLikeSeriesStructNamespace(self) diff --git a/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series_cat.py b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series_cat.py new file mode 100644 index 0000000..912da70 --- /dev/null +++ b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series_cat.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from narwhals._compliant.any_namespace import CatNamespace +from narwhals._pandas_like.utils import PandasLikeSeriesNamespace + +if TYPE_CHECKING: + from narwhals._pandas_like.series import PandasLikeSeries + + +class PandasLikeSeriesCatNamespace( + PandasLikeSeriesNamespace, CatNamespace["PandasLikeSeries"] +): + def get_categories(self) -> PandasLikeSeries: + s = self.native + return self.with_native(type(s)(s.cat.categories, name=s.name)) diff --git a/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series_dt.py b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series_dt.py new file mode 100644 index 0000000..c8083e9 --- /dev/null +++ b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series_dt.py @@ -0,0 +1,237 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from narwhals._compliant.any_namespace import DateTimeNamespace +from narwhals._duration import parse_interval_string +from narwhals._pandas_like.utils import ( + UNIT_DICT, + PandasLikeSeriesNamespace, + calculate_timestamp_date, + calculate_timestamp_datetime, + get_dtype_backend, + int_dtype_mapper, + is_pyarrow_dtype_backend, +) + +if TYPE_CHECKING: + from narwhals._pandas_like.series import PandasLikeSeries + from narwhals.typing import TimeUnit + + +class PandasLikeSeriesDateTimeNamespace( + PandasLikeSeriesNamespace, DateTimeNamespace["PandasLikeSeries"] +): + def date(self) -> PandasLikeSeries: + result = self.with_native(self.native.dt.date) + if str(result.dtype).lower() == "object": + msg = ( + "Accessing `date` on the default pandas backend " + "will return a Series of type `object`." + "\nThis differs from polars API and will prevent `.dt` chaining. 
" + "Please switch to the `pyarrow` backend:" + '\ndf.convert_dtypes(dtype_backend="pyarrow")' + ) + raise NotImplementedError(msg) + return result + + def year(self) -> PandasLikeSeries: + return self.with_native(self.native.dt.year) + + def month(self) -> PandasLikeSeries: + return self.with_native(self.native.dt.month) + + def day(self) -> PandasLikeSeries: + return self.with_native(self.native.dt.day) + + def hour(self) -> PandasLikeSeries: + return self.with_native(self.native.dt.hour) + + def minute(self) -> PandasLikeSeries: + return self.with_native(self.native.dt.minute) + + def second(self) -> PandasLikeSeries: + return self.with_native(self.native.dt.second) + + def millisecond(self) -> PandasLikeSeries: + return self.microsecond() // 1000 + + def microsecond(self) -> PandasLikeSeries: + if self.backend_version < (3, 0, 0) and self._is_pyarrow(): + # crazy workaround for https://github.com/pandas-dev/pandas/issues/59154 + import pyarrow.compute as pc # ignore-banned-import() + + from narwhals._arrow.utils import lit + + arr_ns = self.native.array + arr = arr_ns.__arrow_array__() + result_arr = pc.add( + pc.multiply(pc.millisecond(arr), lit(1_000)), pc.microsecond(arr) + ) + result = type(self.native)(type(arr_ns)(result_arr), name=self.native.name) + return self.with_native(result) + + return self.with_native(self.native.dt.microsecond) + + def nanosecond(self) -> PandasLikeSeries: + return self.microsecond() * 1_000 + self.native.dt.nanosecond + + def ordinal_day(self) -> PandasLikeSeries: + year_start = self.native.dt.year + result = ( + self.native.to_numpy().astype("datetime64[D]") + - (year_start.to_numpy() - 1970).astype("datetime64[Y]") + ).astype("int32") + 1 + dtype = "Int64[pyarrow]" if self._is_pyarrow() else "int32" + return self.with_native( + type(self.native)(result, dtype=dtype, name=year_start.name) + ) + + def weekday(self) -> PandasLikeSeries: + # Pandas is 0-6 while Polars is 1-7 + return self.with_native(self.native.dt.weekday) + 1 + + def _is_pyarrow(self) -> bool: + return is_pyarrow_dtype_backend(self.native.dtype, self.implementation) + + def _get_total_seconds(self) -> Any: + if hasattr(self.native.dt, "total_seconds"): + return self.native.dt.total_seconds() + else: # pragma: no cover + return ( + self.native.dt.days * 86400 + + self.native.dt.seconds + + (self.native.dt.microseconds / 1e6) + + (self.native.dt.nanoseconds / 1e9) + ) + + def total_minutes(self) -> PandasLikeSeries: + s = self._get_total_seconds() + # this calculates the sign of each series element + s_sign = 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 + s_abs = s.abs() // 60 + if ~s.isna().any(): + s_abs = s_abs.astype(int_dtype_mapper(s.dtype)) + return self.with_native(s_abs * s_sign) + + def total_seconds(self) -> PandasLikeSeries: + s = self._get_total_seconds() + # this calculates the sign of each series element + s_sign = 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 + s_abs = s.abs() // 1 + if ~s.isna().any(): + s_abs = s_abs.astype(int_dtype_mapper(s.dtype)) + return self.with_native(s_abs * s_sign) + + def total_milliseconds(self) -> PandasLikeSeries: + s = self._get_total_seconds() * 1e3 + # this calculates the sign of each series element + s_sign = 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 + s_abs = s.abs() // 1 + if ~s.isna().any(): + s_abs = s_abs.astype(int_dtype_mapper(s.dtype)) + return self.with_native(s_abs * s_sign) + + def total_microseconds(self) -> PandasLikeSeries: + s = self._get_total_seconds() * 1e6 + # this calculates the sign of each series 
element + s_sign = 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 + s_abs = s.abs() // 1 + if ~s.isna().any(): + s_abs = s_abs.astype(int_dtype_mapper(s.dtype)) + return self.with_native(s_abs * s_sign) + + def total_nanoseconds(self) -> PandasLikeSeries: + s = self._get_total_seconds() * 1e9 + # this calculates the sign of each series element + s_sign = 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 + s_abs = s.abs() // 1 + if ~s.isna().any(): + s_abs = s_abs.astype(int_dtype_mapper(s.dtype)) + return self.with_native(s_abs * s_sign) + + def to_string(self, format: str) -> PandasLikeSeries: + # Polars' parser treats `'%.f'` as pandas does `'.%f'` + # PyArrow interprets `'%S'` as "seconds, plus fractional seconds" + # and doesn't support `%f` + if not self._is_pyarrow(): + format = format.replace("%S%.f", "%S.%f") + else: + format = format.replace("%S.%f", "%S").replace("%S%.f", "%S") + return self.with_native(self.native.dt.strftime(format)) + + def replace_time_zone(self, time_zone: str | None) -> PandasLikeSeries: + de_zone = self.native.dt.tz_localize(None) + result = de_zone.dt.tz_localize(time_zone) if time_zone is not None else de_zone + return self.with_native(result) + + def convert_time_zone(self, time_zone: str) -> PandasLikeSeries: + if self.compliant.dtype.time_zone is None: # type: ignore[attr-defined] + result = self.native.dt.tz_localize("UTC").dt.tz_convert(time_zone) + else: + result = self.native.dt.tz_convert(time_zone) + return self.with_native(result) + + def timestamp(self, time_unit: TimeUnit) -> PandasLikeSeries: + s = self.native + dtype = self.compliant.dtype + mask_na = s.isna() + dtypes = self.version.dtypes + if dtype == dtypes.Date: + # Date is only supported in pandas dtypes if pyarrow-backed + s_cast = s.astype("Int32[pyarrow]") + result = calculate_timestamp_date(s_cast, time_unit) + elif isinstance(dtype, dtypes.Datetime): + fn = ( + s.view + if (self.implementation.is_pandas() and self.backend_version < (2,)) + else s.astype + ) + s_cast = fn("Int64[pyarrow]") if self._is_pyarrow() else fn("int64") + result = calculate_timestamp_datetime(s_cast, dtype.time_unit, time_unit) + else: + msg = "Input should be either of Date or Datetime type" + raise TypeError(msg) + result[mask_na] = None + return self.with_native(result) + + def truncate(self, every: str) -> PandasLikeSeries: + multiple, unit = parse_interval_string(every) + native = self.native + if self.implementation.is_cudf(): + if multiple != 1: + msg = f"Only multiple `1` is supported for cuDF, got: {multiple}." + raise NotImplementedError(msg) + return self.with_native(self.native.dt.floor(UNIT_DICT.get(unit, unit))) + dtype_backend = get_dtype_backend(native.dtype, self.compliant._implementation) + if unit in {"mo", "q", "y"}: + if self.implementation.is_cudf(): + msg = f"Truncating to {unit} is not supported yet for cuDF." 
+ raise NotImplementedError(msg) + if dtype_backend == "pyarrow": + import pyarrow.compute as pc # ignore-banned-import + + from narwhals._arrow.utils import UNITS_DICT + + ca = native.array._pa_array + result_arr = pc.floor_temporal(ca, multiple, UNITS_DICT[unit]) + else: + if unit == "q": + multiple *= 3 + np_unit = "M" + elif unit == "mo": + np_unit = "M" + else: + np_unit = "Y" + arr = native.values + arr_dtype = arr.dtype + result_arr = arr.astype(f"datetime64[{multiple}{np_unit}]").astype( + arr_dtype + ) + result_native = native.__class__( + result_arr, dtype=native.dtype, index=native.index, name=native.name + ) + return self.with_native(result_native) + return self.with_native( + self.native.dt.floor(f"{multiple}{UNIT_DICT.get(unit, unit)}") + ) diff --git a/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series_list.py b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series_list.py new file mode 100644 index 0000000..7816c1b --- /dev/null +++ b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series_list.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from narwhals._compliant.any_namespace import ListNamespace +from narwhals._pandas_like.utils import ( + PandasLikeSeriesNamespace, + get_dtype_backend, + narwhals_to_native_dtype, +) + +if TYPE_CHECKING: + from narwhals._pandas_like.series import PandasLikeSeries + + +class PandasLikeSeriesListNamespace( + PandasLikeSeriesNamespace, ListNamespace["PandasLikeSeries"] +): + def len(self) -> PandasLikeSeries: + result = self.native.list.len() + implementation = self.implementation + backend_version = self.backend_version + if implementation.is_pandas() and backend_version < (3, 0): # pragma: no cover + # `result` is a new object so it's safe to do this inplace. 
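+ # Assigning the input's index back keeps the result row-aligned with the original series.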
+ result.index = self.native.index + dtype = narwhals_to_native_dtype( + self.version.dtypes.UInt32(), + get_dtype_backend(result.dtype, implementation), + implementation, + backend_version, + self.version, + ) + return self.with_native(result.astype(dtype)).alias(self.native.name) diff --git a/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series_str.py b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series_str.py new file mode 100644 index 0000000..c4bef09 --- /dev/null +++ b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series_str.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from narwhals._compliant.any_namespace import StringNamespace +from narwhals._pandas_like.utils import ( + PandasLikeSeriesNamespace, + is_pyarrow_dtype_backend, +) + +if TYPE_CHECKING: + from narwhals._pandas_like.series import PandasLikeSeries + + +class PandasLikeSeriesStringNamespace( + PandasLikeSeriesNamespace, StringNamespace["PandasLikeSeries"] +): + def len_chars(self) -> PandasLikeSeries: + return self.with_native(self.native.str.len()) + + def replace( + self, pattern: str, value: str, *, literal: bool, n: int + ) -> PandasLikeSeries: + return self.with_native( + self.native.str.replace(pat=pattern, repl=value, n=n, regex=not literal) + ) + + def replace_all(self, pattern: str, value: str, *, literal: bool) -> PandasLikeSeries: + return self.replace(pattern, value, literal=literal, n=-1) + + def strip_chars(self, characters: str | None) -> PandasLikeSeries: + return self.with_native(self.native.str.strip(characters)) + + def starts_with(self, prefix: str) -> PandasLikeSeries: + return self.with_native(self.native.str.startswith(prefix)) + + def ends_with(self, suffix: str) -> PandasLikeSeries: + return self.with_native(self.native.str.endswith(suffix)) + + def contains(self, pattern: str, *, literal: bool) -> PandasLikeSeries: + return self.with_native(self.native.str.contains(pat=pattern, regex=not literal)) + + def slice(self, offset: int, length: int | None) -> PandasLikeSeries: + stop = offset + length if length else None + return self.with_native(self.native.str.slice(start=offset, stop=stop)) + + def split(self, by: str) -> PandasLikeSeries: + implementation = self.implementation + if not implementation.is_cudf() and not is_pyarrow_dtype_backend( + self.native.dtype, implementation + ): + msg = ( + "This operation requires a pyarrow-backed series. " + "Please refer to https://narwhals-dev.github.io/narwhals/api-reference/narwhals/#narwhals.maybe_convert_dtypes " + "and ensure you are using dtype_backend='pyarrow'. " + "Additionally, make sure you have pandas version 1.5+ and pyarrow installed. " + ) + raise TypeError(msg) + return self.with_native(self.native.str.split(pat=by)) + + def to_datetime(self, format: str | None) -> PandasLikeSeries: + # If we know inputs are timezone-aware, we can pass `utc=True` for better performance. 
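+ # A "%z" directive (or a literal "Z", i.e. UTC) in the format string implies timezone-aware inputs.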
+ if format and any(x in format for x in ("%z", "Z")): + return self.with_native(self._to_datetime(format, utc=True)) + result = self.with_native(self._to_datetime(format, utc=False)) + if (tz := getattr(result.dtype, "time_zone", None)) and tz != "UTC": + return result.dt.convert_time_zone("UTC") + return result + + def _to_datetime(self, format: str | None, *, utc: bool) -> Any: + return self.implementation.to_native_namespace().to_datetime( + self.native, format=format, utc=utc + ) + + def to_uppercase(self) -> PandasLikeSeries: + return self.with_native(self.native.str.upper()) + + def to_lowercase(self) -> PandasLikeSeries: + return self.with_native(self.native.str.lower()) diff --git a/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series_struct.py b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series_struct.py new file mode 100644 index 0000000..dc80997 --- /dev/null +++ b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/series_struct.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from narwhals._compliant.any_namespace import StructNamespace +from narwhals._pandas_like.utils import PandasLikeSeriesNamespace + +if TYPE_CHECKING: + from narwhals._pandas_like.series import PandasLikeSeries + + +class PandasLikeSeriesStructNamespace( + PandasLikeSeriesNamespace, StructNamespace["PandasLikeSeries"] +): + def field(self, name: str) -> PandasLikeSeries: + return self.with_native(self.native.struct.field(name)).alias(name) diff --git a/venv/lib/python3.8/site-packages/narwhals/_pandas_like/typing.py b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/typing.py new file mode 100644 index 0000000..6f7bcb2 --- /dev/null +++ b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/typing.py @@ -0,0 +1,15 @@ +from __future__ import annotations # pragma: no cover + +from typing import TYPE_CHECKING # pragma: no cover + +if TYPE_CHECKING: + from typing import Any, TypeVar + + import pandas as pd + from typing_extensions import TypeAlias + + from narwhals._pandas_like.expr import PandasLikeExpr + from narwhals._pandas_like.series import PandasLikeSeries + + IntoPandasLikeExpr: TypeAlias = "PandasLikeExpr | PandasLikeSeries" + NDFrameT = TypeVar("NDFrameT", "pd.DataFrame", "pd.Series[Any]") diff --git a/venv/lib/python3.8/site-packages/narwhals/_pandas_like/utils.py b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/utils.py new file mode 100644 index 0000000..bc75c14 --- /dev/null +++ b/venv/lib/python3.8/site-packages/narwhals/_pandas_like/utils.py @@ -0,0 +1,673 @@ +from __future__ import annotations + +import functools +import re +from contextlib import suppress +from typing import TYPE_CHECKING, Any, Callable, Literal, Sized, TypeVar + +import pandas as pd + +from narwhals._compliant.series import EagerSeriesNamespace +from narwhals._utils import ( + Implementation, + Version, + _DeferredIterable, + check_columns_exist, + isinstance_or_issubclass, +) +from narwhals.exceptions import DuplicateError, ShapeError + +T = TypeVar("T", bound=Sized) + +if TYPE_CHECKING: + from pandas._typing import Dtype as PandasDtype + + from narwhals._pandas_like.expr import PandasLikeExpr + from narwhals._pandas_like.series import PandasLikeSeries + from narwhals.dtypes import DType + from narwhals.typing import DTypeBackend, IntoDType, TimeUnit, _1DArray + + ExprT = TypeVar("ExprT", bound=PandasLikeExpr) + + +PANDAS_LIKE_IMPLEMENTATION = { + Implementation.PANDAS, + Implementation.CUDF, + Implementation.MODIN, +} +PD_DATETIME_RGX = 
r"""^ + datetime64\[ + (?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns + (?:, # Begin non-capturing group for optional timezone + \s* # Optional whitespace after comma + (?P<time_zone> # Start named group for timezone + [a-zA-Z\/]+ # Match timezone name, e.g., UTC, America/New_York + (?:[+-]\d{2}:\d{2})? # Optional offset in format +HH:MM or -HH:MM + | # OR + pytz\.FixedOffset\(\d+\) # Match pytz.FixedOffset with integer offset in parentheses + ) # End time_zone group + )? # End optional timezone group + \] # Closing bracket for datetime64 +$""" +PATTERN_PD_DATETIME = re.compile(PD_DATETIME_RGX, re.VERBOSE) +PA_DATETIME_RGX = r"""^ + timestamp\[ + (?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns + (?:, # Begin non-capturing group for optional timezone + \s?tz= # Match "tz=" prefix + (?P<time_zone> # Start named group for timezone + [a-zA-Z\/]* # Match timezone name (e.g., UTC, America/New_York) + (?: # Begin optional non-capturing group for offset + [+-]\d{2}:\d{2} # Match offset in format +HH:MM or -HH:MM + )? # End optional offset group + ) # End time_zone group + )? # End optional timezone group + \] # Closing bracket for timestamp + \[pyarrow\] # Literal string "[pyarrow]" +$""" +PATTERN_PA_DATETIME = re.compile(PA_DATETIME_RGX, re.VERBOSE) +PD_DURATION_RGX = r"""^ + timedelta64\[ + (?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns + \] # Closing bracket for timedelta64 +$""" + +PATTERN_PD_DURATION = re.compile(PD_DURATION_RGX, re.VERBOSE) +PA_DURATION_RGX = r"""^ + duration\[ + (?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns + \] # Closing bracket for duration + \[pyarrow\] # Literal string "[pyarrow]" +$""" +PATTERN_PA_DURATION = re.compile(PA_DURATION_RGX, re.VERBOSE) + +UNIT_DICT = {"d": "D", "m": "min"} + + +def align_and_extract_native( + lhs: PandasLikeSeries, rhs: PandasLikeSeries | object +) -> tuple[pd.Series[Any] | object, pd.Series[Any] | object]: + """Validate RHS of binary operation. + + If the comparison isn't supported, return `NotImplemented` so that the + "right-hand-side" operation (e.g. `__radd__`) can be tried. + """ + from narwhals._pandas_like.dataframe import PandasLikeDataFrame + from narwhals._pandas_like.series import PandasLikeSeries + + lhs_index = lhs.native.index + + if isinstance(rhs, PandasLikeDataFrame): + return NotImplemented + + if lhs._broadcast and isinstance(rhs, PandasLikeSeries) and not rhs._broadcast: + return lhs.native.iloc[0], rhs.native + + if isinstance(rhs, PandasLikeSeries): + if rhs._broadcast: + return (lhs.native, rhs.native.iloc[0]) + if rhs.native.index is not lhs_index: + return ( + lhs.native, + set_index( + rhs.native, + lhs_index, + implementation=rhs._implementation, + backend_version=rhs._backend_version, + ), + ) + return (lhs.native, rhs.native) + + if isinstance(rhs, list): + msg = "Expected Series or scalar, got list." + raise TypeError(msg) + # `rhs` must be scalar, so just leave it as-is + return lhs.native, rhs + + +def set_index( + obj: T, + index: Any, + *, + implementation: Implementation, + backend_version: tuple[int, ...], +) -> T: + """Wrapper around pandas' set_axis to set object index. + + We can set `copy` / `inplace` based on implementation/version. 
+ """ + if isinstance(index, implementation.to_native_namespace().Index) and ( + expected_len := len(index) + ) != (actual_len := len(obj)): + msg = f"Expected object of length {expected_len}, got length: {actual_len}" + raise ShapeError(msg) + if implementation is Implementation.CUDF: # pragma: no cover + obj = obj.copy(deep=False) # type: ignore[attr-defined] + obj.index = index # type: ignore[attr-defined] + return obj + if implementation is Implementation.PANDAS and ( + backend_version < (1,) + ): # pragma: no cover + kwargs = {"inplace": False} + else: + kwargs = {} + if implementation is Implementation.PANDAS and ( + (1, 5) <= backend_version < (3,) + ): # pragma: no cover + kwargs["copy"] = False + else: # pragma: no cover + pass + return obj.set_axis(index, axis=0, **kwargs) # type: ignore[attr-defined] + + +def rename( + obj: T, + *args: Any, + implementation: Implementation, + backend_version: tuple[int, ...], + **kwargs: Any, +) -> T: + """Wrapper around pandas' rename so that we can set `copy` based on implementation/version.""" + if implementation is Implementation.PANDAS and ( + backend_version >= (3,) + ): # pragma: no cover + return obj.rename(*args, **kwargs) # type: ignore[attr-defined] + return obj.rename(*args, **kwargs, copy=False) # type: ignore[attr-defined] + + +@functools.lru_cache(maxsize=16) +def non_object_native_to_narwhals_dtype(native_dtype: Any, version: Version) -> DType: # noqa: C901, PLR0912 + dtype = str(native_dtype) + + dtypes = version.dtypes + if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}: + return dtypes.Int64() + if dtype in {"int32", "Int32", "Int32[pyarrow]", "int32[pyarrow]"}: + return dtypes.Int32() + if dtype in {"int16", "Int16", "Int16[pyarrow]", "int16[pyarrow]"}: + return dtypes.Int16() + if dtype in {"int8", "Int8", "Int8[pyarrow]", "int8[pyarrow]"}: + return dtypes.Int8() + if dtype in {"uint64", "UInt64", "UInt64[pyarrow]", "uint64[pyarrow]"}: + return dtypes.UInt64() + if dtype in {"uint32", "UInt32", "UInt32[pyarrow]", "uint32[pyarrow]"}: + return dtypes.UInt32() + if dtype in {"uint16", "UInt16", "UInt16[pyarrow]", "uint16[pyarrow]"}: + return dtypes.UInt16() + if dtype in {"uint8", "UInt8", "UInt8[pyarrow]", "uint8[pyarrow]"}: + return dtypes.UInt8() + if dtype in { + "float64", + "Float64", + "Float64[pyarrow]", + "float64[pyarrow]", + "double[pyarrow]", + }: + return dtypes.Float64() + if dtype in { + "float32", + "Float32", + "Float32[pyarrow]", + "float32[pyarrow]", + "float[pyarrow]", + }: + return dtypes.Float32() + if dtype in {"string", "string[python]", "string[pyarrow]", "large_string[pyarrow]"}: + return dtypes.String() + if dtype in {"bool", "boolean", "boolean[pyarrow]", "bool[pyarrow]"}: + return dtypes.Boolean() + if dtype.startswith("dictionary<"): + return dtypes.Categorical() + if dtype == "category": + return native_categorical_to_narwhals_dtype(native_dtype, version) + if (match_ := PATTERN_PD_DATETIME.match(dtype)) or ( + match_ := PATTERN_PA_DATETIME.match(dtype) + ): + dt_time_unit: TimeUnit = match_.group("time_unit") # type: ignore[assignment] + dt_time_zone: str | None = match_.group("time_zone") + return dtypes.Datetime(dt_time_unit, dt_time_zone) + if (match_ := PATTERN_PD_DURATION.match(dtype)) or ( + match_ := PATTERN_PA_DURATION.match(dtype) + ): + du_time_unit: TimeUnit = match_.group("time_unit") # type: ignore[assignment] + return dtypes.Duration(du_time_unit) + if dtype == "date32[day][pyarrow]": + return dtypes.Date() + if dtype.startswith("decimal") and 
dtype.endswith("[pyarrow]"): + return dtypes.Decimal() + if dtype.startswith("time") and dtype.endswith("[pyarrow]"): + return dtypes.Time() + if dtype.startswith("binary") and dtype.endswith("[pyarrow]"): + return dtypes.Binary() + return dtypes.Unknown() # pragma: no cover + + +def object_native_to_narwhals_dtype( + series: PandasLikeSeries, version: Version, implementation: Implementation +) -> DType: + dtypes = version.dtypes + if implementation is Implementation.CUDF: # pragma: no cover + # Per conversations with their maintainers, they don't support arbitrary + # objects, so we can just return String. + return dtypes.String() + + # Arbitrary limit of 100 elements to use to sniff dtype. + inferred_dtype = pd.api.types.infer_dtype(series.head(100), skipna=True) + if inferred_dtype == "string": + return dtypes.String() + if inferred_dtype == "empty" and version is not Version.V1: + # Default to String for empty Series. + return dtypes.String() + elif inferred_dtype == "empty": + # But preserve returning Object in V1. + return dtypes.Object() + return dtypes.Object() + + +def native_categorical_to_narwhals_dtype( + native_dtype: pd.CategoricalDtype, + version: Version, + implementation: Literal[Implementation.CUDF] | None = None, +) -> DType: + dtypes = version.dtypes + if version is Version.V1: + return dtypes.Categorical() + if native_dtype.ordered: + into_iter = ( + _cudf_categorical_to_list(native_dtype) + if implementation is Implementation.CUDF + else native_dtype.categories.to_list + ) + return dtypes.Enum(_DeferredIterable(into_iter)) + return dtypes.Categorical() + + +def _cudf_categorical_to_list( + native_dtype: Any, +) -> Callable[[], list[Any]]: # pragma: no cover + # NOTE: https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/api/cudf.core.dtypes.categoricaldtype/#cudf.core.dtypes.CategoricalDtype + def fn() -> list[Any]: + return native_dtype.categories.to_arrow().to_pylist() + + return fn + + +def native_to_narwhals_dtype( + native_dtype: Any, version: Version, implementation: Implementation +) -> DType: + str_dtype = str(native_dtype) + + if str_dtype.startswith(("large_list", "list", "struct", "fixed_size_list")): + from narwhals._arrow.utils import ( + native_to_narwhals_dtype as arrow_native_to_narwhals_dtype, + ) + + if hasattr(native_dtype, "to_arrow"): # pragma: no cover + # cudf, cudf.pandas + return arrow_native_to_narwhals_dtype(native_dtype.to_arrow(), version) + return arrow_native_to_narwhals_dtype(native_dtype.pyarrow_dtype, version) + if str_dtype == "category" and implementation.is_cudf(): + # https://github.com/rapidsai/cudf/issues/18536 + # https://github.com/rapidsai/cudf/issues/14027 + return native_categorical_to_narwhals_dtype( + native_dtype, version, Implementation.CUDF + ) + if str_dtype != "object": + return non_object_native_to_narwhals_dtype(native_dtype, version) + elif implementation is Implementation.DASK: + # Per conversations with their maintainers, they don't support arbitrary + # objects, so we can just return String. + return version.dtypes.String() + msg = ( + "Unreachable code, object dtype should be handled separately" # pragma: no cover + ) + raise AssertionError(msg) + + +def get_dtype_backend(dtype: Any, implementation: Implementation) -> DTypeBackend: + """Get dtype backend for pandas type. + + Matches pandas' `dtype_backend` argument in `convert_dtypes`. 
+ """ + if implementation is Implementation.CUDF: + return None + if hasattr(pd, "ArrowDtype") and isinstance(dtype, pd.ArrowDtype): + return "pyarrow" + with suppress(AttributeError): + sentinel = object() + if ( + isinstance(dtype, pd.api.extensions.ExtensionDtype) + and getattr(dtype, "base", sentinel) is None + ): + return "numpy_nullable" + return None + + +@functools.lru_cache(maxsize=16) +def is_pyarrow_dtype_backend(dtype: Any, implementation: Implementation) -> bool: + return get_dtype_backend(dtype, implementation) == "pyarrow" + + +def narwhals_to_native_dtype( # noqa: C901, PLR0912, PLR0915 + dtype: IntoDType, + dtype_backend: DTypeBackend, + implementation: Implementation, + backend_version: tuple[int, ...], + version: Version, +) -> str | PandasDtype: + if dtype_backend is not None and dtype_backend not in {"pyarrow", "numpy_nullable"}: + msg = f"Expected one of {{None, 'pyarrow', 'numpy_nullable'}}, got: '{dtype_backend}'" + raise ValueError(msg) + dtypes = version.dtypes + if isinstance_or_issubclass(dtype, dtypes.Decimal): + msg = "Casting to Decimal is not supported yet." + raise NotImplementedError(msg) + if isinstance_or_issubclass(dtype, dtypes.Float64): + if dtype_backend == "pyarrow": + return "Float64[pyarrow]" + elif dtype_backend == "numpy_nullable": + return "Float64" + return "float64" + if isinstance_or_issubclass(dtype, dtypes.Float32): + if dtype_backend == "pyarrow": + return "Float32[pyarrow]" + elif dtype_backend == "numpy_nullable": + return "Float32" + return "float32" + if isinstance_or_issubclass(dtype, dtypes.Int64): + if dtype_backend == "pyarrow": + return "Int64[pyarrow]" + elif dtype_backend == "numpy_nullable": + return "Int64" + return "int64" + if isinstance_or_issubclass(dtype, dtypes.Int32): + if dtype_backend == "pyarrow": + return "Int32[pyarrow]" + elif dtype_backend == "numpy_nullable": + return "Int32" + return "int32" + if isinstance_or_issubclass(dtype, dtypes.Int16): + if dtype_backend == "pyarrow": + return "Int16[pyarrow]" + elif dtype_backend == "numpy_nullable": + return "Int16" + return "int16" + if isinstance_or_issubclass(dtype, dtypes.Int8): + if dtype_backend == "pyarrow": + return "Int8[pyarrow]" + elif dtype_backend == "numpy_nullable": + return "Int8" + return "int8" + if isinstance_or_issubclass(dtype, dtypes.UInt64): + if dtype_backend == "pyarrow": + return "UInt64[pyarrow]" + elif dtype_backend == "numpy_nullable": + return "UInt64" + return "uint64" + if isinstance_or_issubclass(dtype, dtypes.UInt32): + if dtype_backend == "pyarrow": + return "UInt32[pyarrow]" + elif dtype_backend == "numpy_nullable": + return "UInt32" + return "uint32" + if isinstance_or_issubclass(dtype, dtypes.UInt16): + if dtype_backend == "pyarrow": + return "UInt16[pyarrow]" + elif dtype_backend == "numpy_nullable": + return "UInt16" + return "uint16" + if isinstance_or_issubclass(dtype, dtypes.UInt8): + if dtype_backend == "pyarrow": + return "UInt8[pyarrow]" + elif dtype_backend == "numpy_nullable": + return "UInt8" + return "uint8" + if isinstance_or_issubclass(dtype, dtypes.String): + if dtype_backend == "pyarrow": + return "string[pyarrow]" + elif dtype_backend == "numpy_nullable": + return "string" + return str + if isinstance_or_issubclass(dtype, dtypes.Boolean): + if dtype_backend == "pyarrow": + return "boolean[pyarrow]" + elif dtype_backend == "numpy_nullable": + return "boolean" + return "bool" + if isinstance_or_issubclass(dtype, dtypes.Categorical): + # TODO(Unassigned): is there no pyarrow-backed categorical? 
+        # or at least, convert_dtypes(dtype_backend='pyarrow') doesn't
+        # convert to it?
+        return "category"
+    if isinstance_or_issubclass(dtype, dtypes.Datetime):
+        # Pandas does not support "ms" or "us" time units before version 2.0
+        if implementation is Implementation.PANDAS and backend_version < (
+            2,
+        ):  # pragma: no cover
+            dt_time_unit = "ns"
+        else:
+            dt_time_unit = dtype.time_unit
+
+        if dtype_backend == "pyarrow":
+            tz_part = f", tz={tz}" if (tz := dtype.time_zone) else ""
+            return f"timestamp[{dt_time_unit}{tz_part}][pyarrow]"
+        else:
+            tz_part = f", {tz}" if (tz := dtype.time_zone) else ""
+            return f"datetime64[{dt_time_unit}{tz_part}]"
+    if isinstance_or_issubclass(dtype, dtypes.Duration):
+        if implementation is Implementation.PANDAS and backend_version < (
+            2,
+        ):  # pragma: no cover
+            du_time_unit = "ns"
+        else:
+            du_time_unit = dtype.time_unit
+        return (
+            f"duration[{du_time_unit}][pyarrow]"
+            if dtype_backend == "pyarrow"
+            else f"timedelta64[{du_time_unit}]"
+        )
+    if isinstance_or_issubclass(dtype, dtypes.Date):
+        try:
+            import pyarrow as pa  # ignore-banned-import  # noqa: F401
+        except ModuleNotFoundError as exc:  # pragma: no cover
+            msg = "'pyarrow>=11.0.0' is required for `Date` dtype."
+            raise ModuleNotFoundError(msg) from exc
+        return "date32[pyarrow]"
+    if isinstance_or_issubclass(dtype, dtypes.Enum):
+        if version is Version.V1:
+            msg = "Converting to Enum is not supported in narwhals.stable.v1"
+            raise NotImplementedError(msg)
+        if isinstance(dtype, dtypes.Enum):
+            ns = implementation.to_native_namespace()
+            return ns.CategoricalDtype(dtype.categories, ordered=True)
+        msg = "Can not cast / initialize Enum without categories present"
+        raise ValueError(msg)
+
+    if isinstance_or_issubclass(
+        dtype, (dtypes.Struct, dtypes.Array, dtypes.List, dtypes.Time, dtypes.Binary)
+    ):
+        if implementation is Implementation.PANDAS and backend_version >= (2, 2):
+            try:
+                import pandas as pd
+                import pyarrow as pa  # ignore-banned-import # noqa: F401
+            except ImportError as exc:  # pragma: no cover
+                msg = f"Unable to convert to {dtype} due to the following exception: {exc.msg}"
+                raise ImportError(msg) from exc
+            from narwhals._arrow.utils import (
+                narwhals_to_native_dtype as arrow_narwhals_to_native_dtype,
+            )
+
+            return pd.ArrowDtype(arrow_narwhals_to_native_dtype(dtype, version=version))
+        else:  # pragma: no cover
+            msg = (
+                f"Converting to {dtype} dtype is not supported for implementation "
+                f"{implementation} and version {version}."
+            )
+            raise NotImplementedError(msg)
+    msg = f"Unknown dtype: {dtype}"  # pragma: no cover
+    raise AssertionError(msg)
+
+
+def align_series_full_broadcast(*series: PandasLikeSeries) -> list[PandasLikeSeries]:
+    # Ensure all of `series` have the same length and index. Scalars get broadcasted to
+    # the full length of the longest Series. This is useful when you need to construct a
+    # full Series anyway (e.g. `DataFrame.select`). It should not be used in binary operations,
+    # such as `nw.col('a') - nw.col('a').mean()`, because then it's more efficient to extract
+    # the right-hand-side's single element as a scalar.
+ native_namespace = series[0].__native_namespace__() + + lengths = [len(s) for s in series] + max_length = max(lengths) + + idx = series[lengths.index(max_length)].native.index + reindexed = [] + for s in series: + if s._broadcast: + reindexed.append( + s._with_native( + native_namespace.Series( + [s.native.iloc[0]] * max_length, + index=idx, + name=s.name, + dtype=s.native.dtype, + ) + ) + ) + + elif s.native.index is not idx: + reindexed.append( + s._with_native( + set_index( + s.native, + idx, + implementation=s._implementation, + backend_version=s._backend_version, + ) + ) + ) + else: + reindexed.append(s) + return reindexed + + +def int_dtype_mapper(dtype: Any) -> str: + if "pyarrow" in str(dtype): + return "Int64[pyarrow]" + if str(dtype).lower() != str(dtype): # pragma: no cover + return "Int64" + return "int64" + + +def calculate_timestamp_datetime( # noqa: C901, PLR0912 + s: pd.Series[int], original_time_unit: str, time_unit: str +) -> pd.Series[int]: + if original_time_unit == "ns": + if time_unit == "ns": + result = s + elif time_unit == "us": + result = s // 1_000 + else: + result = s // 1_000_000 + elif original_time_unit == "us": + if time_unit == "ns": + result = s * 1_000 + elif time_unit == "us": + result = s + else: + result = s // 1_000 + elif original_time_unit == "ms": + if time_unit == "ns": + result = s * 1_000_000 + elif time_unit == "us": + result = s * 1_000 + else: + result = s + elif original_time_unit == "s": + if time_unit == "ns": + result = s * 1_000_000_000 + elif time_unit == "us": + result = s * 1_000_000 + else: + result = s * 1_000 + else: # pragma: no cover + msg = f"unexpected time unit {original_time_unit}, please report a bug at https://github.com/narwhals-dev/narwhals" + raise AssertionError(msg) + return result + + +def calculate_timestamp_date(s: pd.Series[int], time_unit: str) -> pd.Series[int]: + s = s * 86_400 # number of seconds in a day + if time_unit == "ns": + result = s * 1_000_000_000 + elif time_unit == "us": + result = s * 1_000_000 + else: + result = s * 1_000 + return result + + +def select_columns_by_name( + df: T, + column_names: list[str] | _1DArray, # NOTE: Cannot be a tuple! + backend_version: tuple[int, ...], + implementation: Implementation, +) -> T: + """Select columns by name. + + Prefer this over `df.loc[:, column_names]` as it's + generally more performant. + """ + if len(column_names) == df.shape[1] and all(column_names == df.columns): # type: ignore[attr-defined] + return df + if (df.columns.dtype.kind == "b") or ( # type: ignore[attr-defined] + implementation is Implementation.PANDAS and backend_version < (1, 5) + ): + # See https://github.com/narwhals-dev/narwhals/issues/1349#issuecomment-2470118122 + # for why we need this + if error := check_columns_exist( + column_names, # type: ignore[arg-type] + available=df.columns.tolist(), # type: ignore[attr-defined] + ): + raise error + return df.loc[:, column_names] # type: ignore[attr-defined] + try: + return df[column_names] # type: ignore[index] + except KeyError as e: + if error := check_columns_exist( + column_names, # type: ignore[arg-type] + available=df.columns.tolist(), # type: ignore[attr-defined] + ): + raise error from e + raise + + +def check_column_names_are_unique(columns: pd.Index[str]) -> None: + try: + len_unique_columns = len(columns.drop_duplicates()) + except Exception: # noqa: BLE001 # pragma: no cover + msg = f"Expected hashable (e.g. 
str or int) column names, got: {columns}"
+        raise ValueError(msg) from None
+
+    if len(columns) != len_unique_columns:
+        from collections import Counter
+
+        counter = Counter(columns)
+        msg = ""
+        for key, value in counter.items():
+            if value > 1:
+                msg += f"\n- '{key}' {value} times"
+        msg = f"Expected unique column names, got:{msg}"
+        raise DuplicateError(msg)
+
+
+class PandasLikeSeriesNamespace(EagerSeriesNamespace["PandasLikeSeries", Any]):
+    @property
+    def implementation(self) -> Implementation:
+        return self.compliant._implementation
+
+    @property
+    def backend_version(self) -> tuple[int, ...]:
+        return self.compliant._backend_version
+
+    @property
+    def version(self) -> Version:
+        return self.compliant._version
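Note on the `total_minutes` / `total_seconds` / `total_milliseconds` family in series_dt.py: the `s_sign` / `s_abs` dance exists because floor division rounds toward negative infinity, which would shift negative durations by one unit. A minimal standalone pandas sketch of the same trick (illustration only, not part of this patch):

    import pandas as pd

    s = pd.Series([-90.0, 90.0])              # total seconds
    naive = s // 60                            # [-2.0, 1.0]: floor turns -90s into -2min
    sign = 2 * (s > 0).astype("int64") - 1     # [-1, 1]
    truncated = (s.abs() // 60) * sign         # [-1.0, 1.0]: truncates toward zero, like Polars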

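The long ladder in `narwhals_to_native_dtype` mirrors the three dtype families pandas can host for one logical type: classical NumPy ("int64"), nullable ("Int64"), and pyarrow-backed ("Int64[pyarrow]"). A quick standalone check of how pandas itself moves between them (assumes pandas>=2.0 with pyarrow installed; not part of this patch):

    import pandas as pd

    s = pd.Series([1, 2, None])                               # float64: NaN forces a float dtype
    print(s.convert_dtypes().dtype)                            # Int64 (the "numpy_nullable" backend)
    print(s.convert_dtypes(dtype_backend="pyarrow").dtype)     # int64[pyarrow]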