From 5bf22fc7e3c392c8bd44315ca2d06d7dca7d084e Mon Sep 17 00:00:00 2001 From: sotech117 Date: Thu, 31 Jul 2025 17:27:24 -0400 Subject: add code for analysis of data --- .../python3.8/site-packages/narwhals/dataframe.py | 3234 ++++++++++++++++++++ 1 file changed, 3234 insertions(+) create mode 100644 venv/lib/python3.8/site-packages/narwhals/dataframe.py (limited to 'venv/lib/python3.8/site-packages/narwhals/dataframe.py') diff --git a/venv/lib/python3.8/site-packages/narwhals/dataframe.py b/venv/lib/python3.8/site-packages/narwhals/dataframe.py new file mode 100644 index 0000000..b0ff471 --- /dev/null +++ b/venv/lib/python3.8/site-packages/narwhals/dataframe.py @@ -0,0 +1,3234 @@ +from __future__ import annotations + +from abc import abstractmethod +from itertools import chain +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Generic, + Iterable, + Iterator, + Literal, + NoReturn, + Sequence, + TypeVar, + overload, +) +from warnings import warn + +from narwhals._expression_parsing import ( + ExprKind, + all_exprs_are_scalar_like, + check_expressions_preserve_length, + is_scalar_like, +) +from narwhals._utils import ( + Implementation, + find_stacklevel, + flatten, + generate_repr, + is_compliant_dataframe, + is_compliant_lazyframe, + is_index_selector, + is_list_of, + is_sequence_like, + is_slice_none, + issue_deprecation_warning, + parse_version, + supports_arrow_c_stream, +) +from narwhals.dependencies import get_polars, is_numpy_array +from narwhals.exceptions import ( + InvalidIntoExprError, + LengthChangingExprError, + OrderDependentExprError, +) +from narwhals.schema import Schema +from narwhals.series import Series +from narwhals.translate import to_native + +if TYPE_CHECKING: + from io import BytesIO + from pathlib import Path + from types import ModuleType + + import pandas as pd + import polars as pl + import pyarrow as pa + from typing_extensions import Concatenate, ParamSpec, Self, TypeAlias + + from narwhals._compliant import 
def _flatten_and_extract(
    self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
) -> tuple[list[CompliantExprAny], list[ExprKind]]:
    """Convert positional and keyword inputs into compliant expressions and their kinds.

    Positional inputs are flattened before processing. Keyword inputs are
    aliased to their keyword name. Every input goes through
    `_extract_compliant`, and its kind is computed with `str_as_lit=False`.

    Returns:
        Two parallel lists: the compliant expressions and the `ExprKind` of
        each one, positional inputs first, then keyword inputs.
    """

    def _pairs() -> Iterator[tuple[CompliantExprAny, ExprKind]]:
        # Extraction and kind detection stay interleaved per input, so an
        # invalid input fails at the same point as a plain sequential loop.
        for positional in flatten(exprs):
            yield (
                self._extract_compliant(positional),
                ExprKind.from_into_expr(positional, str_as_lit=False),
            )
        for alias, named in named_exprs.items():
            yield (
                self._extract_compliant(named).alias(alias),
                ExprKind.from_into_expr(named, str_as_lit=False),
            )

    pairs = list(_pairs())
    return [compliant for compliant, _ in pairs], [kind for _, kind in pairs]
def filter(
    self, *predicates: IntoExpr | Iterable[IntoExpr] | list[bool], **constraints: Any
) -> Self:
    """Keep only the rows that satisfy the given predicates.

    Arguments:
        *predicates: Either a single list of booleans, used directly as a row
            mask, or any number of expressions which are combined with
            `all_horizontal`.
        **constraints: `name=value` pairs; each becomes `col(name) == value`
            and is combined with the predicates.

    Returns:
        A new frame wrapping the filtered compliant frame.
    """
    # Boolean-mask path: the list is handed to the backend unchanged.
    # NOTE: `constraints` are not consulted on this path.
    if len(predicates) == 1 and is_list_of(predicates[0], bool):
        return self._with_compliant(self._compliant_frame.filter(predicates[0]))

    from narwhals.functions import col

    flat = flatten(predicates)
    check_expressions_preserve_length(*flat, function_name="filter")
    plx = self.__narwhals_namespace__()
    predicate_exprs, _kinds = self._flatten_and_extract(*flat)
    constraint_exprs = (
        (col(column_name) == expected)._to_compliant_expr(plx)
        for column_name, expected in constraints.items()
    )
    combined = plx.all_horizontal(*predicate_exprs, *constraint_exprs)
    return self._with_compliant(self._compliant_frame.filter(combined))
def join(
    self,
    other: Self,
    on: str | list[str] | None = None,
    how: JoinStrategy = "inner",
    *,
    left_on: str | list[str] | None = None,
    right_on: str | list[str] | None = None,
    suffix: str = "_right",
) -> Self:
    """Validate the join keys and delegate the join to the compliant frame.

    Arguments:
        other: Frame to join with.
        on: Key(s) shared by both frames. Mutually exclusive with
            `left_on` / `right_on`.
        how: One of "inner", "left", "full", "cross", "anti", "semi".
        left_on: Key(s) in this frame.
        right_on: Key(s) in `other`.
        suffix: Forwarded unchanged to the backend join.

    Returns:
        A new frame wrapping the joined compliant frame.

    Raises:
        NotImplementedError: If `how` is not a supported strategy.
        ValueError: If the key arguments are inconsistent with `how` or with
            each other.
    """

    def _as_list(keys: str | list[str] | None) -> list[str] | None:
        # A lone string means a single key; anything else passes through.
        return [keys] if isinstance(keys, str) else keys

    on, left_on, right_on = _as_list(on), _as_list(left_on), _as_list(right_on)

    strategies = ("inner", "left", "full", "cross", "anti", "semi")
    if how not in strategies:
        raise NotImplementedError(
            f"Only the following join strategies are supported: {strategies}; found '{how}'."
        )

    side_keys_given = not (left_on is None and right_on is None)
    if how == "cross":
        # A cross join takes no keys at all.
        if side_keys_given or on is not None:
            raise ValueError(
                "Can not pass `left_on`, `right_on` or `on` keys for cross join"
            )
    elif on is None:
        if left_on is None or right_on is None:
            raise ValueError(
                f"Either (`left_on` and `right_on`) or `on` keys should be specified for {how}."
            )
    elif side_keys_given:
        raise ValueError(
            f"If `on` is specified, `left_on` and `right_on` should be None for {how}."
        )

    if on is not None:
        left_on = right_on = on

    both_lists = isinstance(left_on, list) and isinstance(right_on, list)
    if both_lists and len(left_on) != len(right_on):
        raise ValueError("`left_on` and `right_on` must have the same length.")

    joined = self._compliant_frame.join(
        self._extract_compliant(other),
        how=how,
        left_on=left_on,
        right_on=right_on,
        suffix=suffix,
    )
    return self._with_compliant(joined)
def __ne__(self, other: object) -> NoReturn:
    """Reject `frame != other` and point the caller at expressions instead.

    Python dispatches the `!=` operator to `__ne__`. The previous spelling,
    `__neq__`, is not a special method, so `df != 0` never reached it and
    surfaced the `__eq__` hint instead.

    Arguments:
        other: The right-hand operand; it is never inspected.

    Raises:
        NotImplementedError: Always.
    """
    msg = (
        "DataFrame.__ne__ and LazyFrame.__ne__ are not implemented, please "
        "use expressions instead.\n\n"
        "Hint: instead of\n"
        "    df != 0\n"
        "you may want to use\n"
        "    df.select(nw.all() != 0)"
    )
    raise NotImplementedError(msg)


# Backward-compatible alias: keeps explicit `frame.__neq__(other)` calls working.
__neq__ = __ne__
def _extract_compliant(self, arg: Any) -> Any:
    """Translate a user-facing argument into its compliant counterpart.

    The checks run in this order:

    - a Narwhals frame yields its compliant frame,
    - a Narwhals Series yields its compliant series converted to an expression,
    - a Narwhals Expr is compiled against this frame's namespace,
    - a string becomes a column expression,
    - a NumPy array becomes a series-backed expression.

    Raises:
        TypeError: If Polars is available and the argument's type name
            mentions "polars" (a native Polars object was passed).
        InvalidIntoExprError: For any other unsupported type.
    """
    from narwhals.expr import Expr
    from narwhals.series import Series

    plx: EagerNamespaceAny = self.__narwhals_namespace__()

    if isinstance(arg, BaseFrame):
        extracted = arg._compliant_frame
    elif isinstance(arg, Series):
        extracted = arg._compliant_series._to_expr()
    elif isinstance(arg, Expr):
        extracted = arg._to_compliant_expr(self.__narwhals_namespace__())
    elif isinstance(arg, str):
        extracted = plx.col(arg)
    elif get_polars() is not None and "polars" in str(type(arg)):  # pragma: no cover
        raise TypeError(
            f"Expected Narwhals object, got: {type(arg)}.\n\n"
            "Perhaps you:\n"
            "- Forgot a `nw.from_native` somewhere?\n"
            "- Used `pl.col` instead of `nw.col`?"
        )
    elif is_numpy_array(arg):
        extracted = plx._series.from_numpy(arg, context=plx)._to_expr()
    else:
        raise InvalidIntoExprError.from_invalid_type(type(arg))
    return extracted
def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
    """Export the DataFrame through the Arrow PyCapsule Interface.

    If the native frame implements `__arrow_c_stream__` itself, its result is
    returned directly. Otherwise the frame is converted with `to_arrow` and
    PyArrow's implementation is used, which needs `pyarrow>=14.0.0`.

    See [PyCapsule Interface](https://arrow.apache.org/docs/dev/format/CDataInterface/PyCapsuleInterface.html)
    for more.

    Raises:
        ModuleNotFoundError: If the PyArrow fallback is needed but PyArrow is
            missing or older than 14.0.
    """
    native = self._compliant_frame._native_frame
    if supports_arrow_c_stream(native):
        return native.__arrow_c_stream__(requested_schema=requested_schema)

    # Both failure modes below report the same requirement.
    requirement = f"'pyarrow>=14.0.0' is required for `DataFrame.__arrow_c_stream__` for object of type {type(native)}"
    try:
        import pyarrow as pa  # ignore-banned-import
    except ModuleNotFoundError as exc:  # pragma: no cover
        raise ModuleNotFoundError(requirement) from exc
    if parse_version(pa) < (14, 0):  # pragma: no cover
        raise ModuleNotFoundError(requirement) from None

    arrow_table = self.to_arrow()
    return arrow_table.__arrow_c_stream__(requested_schema=requested_schema)  # type: ignore[no-untyped-call]
+ + Examples: + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> df_native = pl.DataFrame({"a": [1, 2], "b": [4, 6]}) + >>> df = nw.from_native(df_native) + + If we call `df.lazy`, we get a `narwhals.LazyFrame` backed by a Polars + LazyFrame. + + >>> df.lazy() # doctest: +SKIP + ┌─────────────────────────────┐ + | Narwhals LazyFrame | + |-----------------------------| + || + └─────────────────────────────┘ + + We can also pass DuckDB as the backend, and then we'll get a + `narwhals.LazyFrame` backed by a `duckdb.DuckDBPyRelation`. + + >>> df.lazy(backend=nw.Implementation.DUCKDB) + ┌──────────────────┐ + |Narwhals LazyFrame| + |------------------| + |┌───────┬───────┐ | + |│ a │ b │ | + |│ int64 │ int64 │ | + |├───────┼───────┤ | + |│ 1 │ 4 │ | + |│ 2 │ 6 │ | + |└───────┴───────┘ | + └──────────────────┘ + """ + lazy_backend = None if backend is None else Implementation.from_backend(backend) + supported_lazy_backends = ( + Implementation.DASK, + Implementation.DUCKDB, + Implementation.POLARS, + ) + if lazy_backend is not None and lazy_backend not in supported_lazy_backends: + msg = ( + "Not-supported backend." + f"\n\nExpected one of {supported_lazy_backends} or `None`, got {lazy_backend}" + ) + raise ValueError(msg) + return self._lazyframe( + self._compliant_frame.lazy(backend=lazy_backend), level="lazy" + ) + + def to_native(self) -> DataFrameT: + """Convert Narwhals DataFrame to native one. + + Returns: + Object of class that user started with. + + Examples: + >>> import pandas as pd + >>> import narwhals as nw + >>> df_native = pd.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} + ... 
) + + Calling `to_native` on a Narwhals DataFrame returns the native object: + + >>> nw.from_native(df_native).to_native() + foo bar ham + 0 1 6.0 a + 1 2 7.0 b + 2 3 8.0 c + """ + return self._compliant_frame._native_frame + + def to_pandas(self) -> pd.DataFrame: + """Convert this DataFrame to a pandas DataFrame. + + Returns: + A pandas DataFrame. + + Examples: + >>> import polars as pl + >>> import narwhals as nw + >>> df_native = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} + ... ) + >>> df = nw.from_native(df_native) + >>> df.to_pandas() + foo bar ham + 0 1 6.0 a + 1 2 7.0 b + 2 3 8.0 c + """ + return self._compliant_frame.to_pandas() + + def to_polars(self) -> pl.DataFrame: + """Convert this DataFrame to a polars DataFrame. + + Returns: + A polars DataFrame. + + Examples: + >>> import pyarrow as pa + >>> import narwhals as nw + >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]}) + >>> df = nw.from_native(df_native) + >>> df.to_polars() + shape: (2, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + └─────┴─────┘ + """ + return self._compliant_frame.to_polars() + + @overload + def write_csv(self, file: None = None) -> str: ... + + @overload + def write_csv(self, file: str | Path | BytesIO) -> None: ... + + def write_csv(self, file: str | Path | BytesIO | None = None) -> str | None: + r"""Write dataframe to comma-separated values (CSV) file. + + Arguments: + file: String, path object or file-like object to which the dataframe will be + written. If None, the resulting csv format is returned as a string. + + Returns: + String or None. + + Examples: + >>> import pandas as pd + >>> import narwhals as nw + >>> df_native = pd.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} + ... 
def get_column(self, name: str) -> Series[Any]:
    """Fetch one column, looked up by name, as a Narwhals Series.

    Arguments:
        name: The column name as a string.

    Returns:
        A Narwhals Series, backed by a native series.

    Notes:
        `name` is typed as `str`, but pandas permits non-string column names.
        Those also work here when the frame is backed by a pandas dataframe
        with such columns, because this method only ever selects by name.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame({"a": [1, 2]})
        >>> df = nw.from_native(df_native)
        >>> df.get_column("a").to_native()
        0    1
        1    2
        Name: a, dtype: int64
    """
    compliant_column = self._compliant_frame.get_column(name)
    return self._series(compliant_column, level=self._level)
def __getitem__(  # noqa: C901, PLR0912
    self,
    item: (
        SingleIndexSelector
        | SingleColSelector
        | MultiColSelector
        | MultiIndexSelector
        | tuple[SingleIndexSelector, SingleColSelector]
        | tuple[SingleIndexSelector, MultiColSelector]
        | tuple[MultiIndexSelector, SingleColSelector]
        | tuple[MultiIndexSelector, MultiColSelector]
    ),
) -> Series[Any] | Self | Any:
    """Extract a column, a slice of the DataFrame, or a single value.

    Arguments:
        item: How to slice the dataframe. What happens depends on what is
            passed. Given a DataFrame `df`:

            - `df['a']` extracts column `'a'` and returns a `Series`.
            - `df[0:2]` extracts the first two rows and returns a `DataFrame`.
            - `df[0:2, 'a']` extracts the first two rows from column `'a'` and
              returns a `Series`.
            - `df[0:2, 0]` extracts the first two rows from the first column and
              returns a `Series`.
            - `df[[0, 1], [0, 1, 2]]` extracts the first two rows and the first
              three columns and returns a `DataFrame`.
            - `df[:, [0, 1, 2]]` extracts all rows from the first three columns
              and returns a `DataFrame`.
            - `df[:, ['a', 'c']]` extracts all rows and columns `'a'` and `'c'`
              and returns a `DataFrame`.
            - `df[['a', 'c']]` extracts all rows and columns `'a'` and `'c'` and
              returns a `DataFrame`.
            - `df[0: 2, ['a', 'c']]` extracts the first two rows and columns
              `'a'` and `'c'` and returns a `DataFrame`.
            - `df[:, 0: 2]` extracts all rows from the first two columns and
              returns a `DataFrame`.
            - `df[:, 'a': 'c']` extracts all rows and all columns positioned
              between `'a'` and `'c'` _inclusive_ and returns a `DataFrame`.
              For example, if the columns are `'a', 'd', 'c', 'b'`, then that
              would extract columns `'a'`, `'d'`, and `'c'`.

    Returns:
        A Narwhals Series when a single column is selected, a single value when
        both the row and the column selector are scalars, and a DataFrame
        otherwise.

    Raises:
        TypeError: If `item` is a tuple with more than two elements, is of an
            unsupported type, or uses a string as the row selector.

    Notes:
        - Integers are always interpreted as positions.
        - Strings are always interpreted as column names.

        In contrast with Polars, pandas allows non-string column names.
        If you don't know whether the column name you're trying to extract
        is definitely a string (e.g. `df[df.columns[0]]`) then you should
        use `DataFrame.get_column` instead.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame({"a": [1, 2]})
        >>> df = nw.from_native(df_native)
        >>> df["a"].to_native()
        0    1
        1    2
        Name: a, dtype: int64
    """
    from narwhals.series import Series

    msg = (
        f"Unexpected type for `DataFrame.__getitem__`, got: {type(item)}.\n\n"
        "Hints:\n"
        "- use `df.item` to select a single item.\n"
        "- Use `df[indices, :]` to select rows positionally.\n"
        "- Use `df.filter(mask)` to filter rows based on a boolean mask."
    )

    # Step 1: split `item` into a row selector and a column selector, where
    # `None` means "everything along that axis".
    if isinstance(item, tuple):
        if len(item) > 2:
            tuple_msg = (
                "Tuples cannot be passed to DataFrame.__getitem__ directly.\n\n"
                "Hint: instead of `df[indices]`, did you mean `df[indices, :]`?"
            )
            raise TypeError(tuple_msg)
        rows = item[0] if item and not is_slice_none(item[0]) else None
        columns = item[1] if len(item) >= 2 and not is_slice_none(item[1]) else None
        if rows is None and columns is None:
            return self
    elif is_index_selector(item):
        rows, columns = item, None
    elif is_sequence_like(item) or isinstance(item, (slice, str)):
        rows, columns = None, item
    else:
        raise TypeError(msg)

    if isinstance(rows, str):
        raise TypeError(msg)

    compliant = self._compliant_frame

    # Step 2: a scalar column selector yields a Series, or a single value
    # when the row selector is a scalar too.
    if isinstance(columns, (int, str)):
        if isinstance(rows, int):
            return self.item(rows, columns)
        column_name = columns if isinstance(columns, str) else self.columns[columns]
        column = self.get_column(column_name)
        return column if rows is None else column[rows]

    # Step 3: every remaining case returns a frame; Narwhals Series selectors
    # are unwrapped before being handed to the compliant frame.
    if isinstance(rows, Series):
        rows = rows._compliant_series
    if isinstance(columns, Series):
        columns = columns._compliant_series

    if rows is None:
        selected = compliant[:, columns]
    elif columns is None:
        selected = compliant[rows, :]
    else:
        selected = compliant[rows, columns]
    return self._with_compliant(selected)
+ + Examples: + >>> import pyarrow as pa + >>> import narwhals as nw + >>> df_native = pa.table({"a": [1, 2], "b": [4, 5]}) + >>> nw.from_native(df_native).row(1) + (, ) + """ + return self._compliant_frame.row(index) + + # inherited + def pipe( + self, + function: Callable[Concatenate[Self, PS], R], + *args: PS.args, + **kwargs: PS.kwargs, + ) -> R: + """Pipe function call. + + Arguments: + function: Function to apply. + args: Positional arguments to pass to function. + kwargs: Keyword arguments to pass to function. + + Returns: + The original object with the function applied. + + Examples: + >>> import pandas as pd + >>> import narwhals as nw + >>> df_native = pd.DataFrame({"a": [1, 2], "ba": [4, 5]}) + >>> nw.from_native(df_native).pipe( + ... lambda _df: _df.select( + ... [x for x in _df.columns if len(x) == 1] + ... ).to_native() + ... ) + a + 0 1 + 1 2 + """ + return super().pipe(function, *args, **kwargs) + + def drop_nulls(self, subset: str | list[str] | None = None) -> Self: + """Drop rows that contain null values. + + Arguments: + subset: Column name(s) for which null values are considered. If set to None + (default), use all columns. + + Returns: + The original object with the rows removed that contained the null values. + + Notes: + pandas handles null values differently from Polars and PyArrow. + See [null_handling](../concepts/null_handling.md) + for reference. + + Examples: + >>> import pyarrow as pa + >>> import narwhals as nw + >>> df_native = pa.table({"a": [1.0, None], "ba": [1.0, 2.0]}) + >>> nw.from_native(df_native).drop_nulls().to_native() + pyarrow.Table + a: double + ba: double + ---- + a: [[1]] + ba: [[1]] + """ + return super().drop_nulls(subset=subset) + + def with_row_index(self, name: str = "index") -> Self: + """Insert column which enumerates rows. + + Arguments: + name: The name of the column as a string. The default is "index". + + Returns: + The original object with the column added. 
@property
def schema(self) -> Schema:
    r"""Ordered mapping of column names to their data type.

    Returns:
        A Narwhals Schema object that displays the mapping of column names.

    Examples:
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]})
        >>> nw.from_native(df_native).schema
        Schema({'foo': Int64, 'bar': Float64})
    """
    frame_schema = super().schema
    return frame_schema

def collect_schema(self) -> Schema:
    r"""Ordered mapping of column names to their data type (method form).

    Returns:
        A Narwhals Schema object that displays the mapping of column names.

    Examples:
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]})
        >>> nw.from_native(df_native).collect_schema()
        Schema({'foo': Int64, 'bar': Float64})
    """
    collected = super().collect_schema()
    return collected

@property
def columns(self) -> list[str]:
    """Column names of the frame.

    Returns:
        The column names stored in a list.

    Examples:
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]})
        >>> nw.from_native(df_native).columns
        ['foo', 'bar']
    """
    names = super().columns
    return names

@overload
def rows(self, *, named: Literal[False] = False) -> list[tuple[Any, ...]]: ...

@overload
def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ...

@overload
def rows(self, *, named: bool) -> list[tuple[Any, ...]] | list[dict[str, Any]]: ...

def rows(
    self, *, named: bool = False
) -> list[tuple[Any, ...]] | list[dict[str, Any]]:
    """Return all data in the DataFrame as a list of rows of python-native values.

    Arguments:
        named: When False (the default) each row is a tuple of values in
            column order. When True each row is a dictionary instead.

    Returns:
        The data as a list of rows.

    Examples:
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]})
        >>> nw.from_native(df_native).rows()
        [(1, 6.0), (2, 7.0)]
    """
    compliant = self._compliant_frame
    return compliant.rows(named=named)  # type: ignore[return-value]
def iter_columns(self) -> Iterator[Series[Any]]:
    """Iterate over the columns of this DataFrame.

    Yields:
        A Narwhals Series, backed by a native series.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame({"foo": [1, 2], "bar": [6.0, 7.0]})
        >>> iter_columns = nw.from_native(df_native).iter_columns()
        >>> next(iter_columns)
        ┌───────────────────────┐
        |    Narwhals Series    |
        |-----------------------|
        |0    1                 |
        |1    2                 |
        |Name: foo, dtype: int64|
        └───────────────────────┘
        >>> next(iter_columns)
        ┌─────────────────────────┐
        |     Narwhals Series     |
        |-------------------------|
        |0    6.0                 |
        |1    7.0                 |
        |Name: bar, dtype: float64|
        └─────────────────────────┘
    """
    # Wrap each backend column lazily, one per step of the iterator.
    yield from (
        self._series(native_column, level=self._level)
        for native_column in self._compliant_frame.iter_columns()
    )

@overload
def iter_rows(
    self, *, named: Literal[False], buffer_size: int = ...
) -> Iterator[tuple[Any, ...]]: ...

@overload
def iter_rows(
    self, *, named: Literal[True], buffer_size: int = ...
) -> Iterator[dict[str, Any]]: ...

@overload
def iter_rows(
    self, *, named: bool, buffer_size: int = ...
) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: ...

def iter_rows(
    self, *, named: bool = False, buffer_size: int = 512
) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]:
    """Return an iterator over the DataFrame of rows of python-native values.

    Arguments:
        named: When False (the default) each row is a tuple of values in
            column order. When True each row is a dictionary instead.
        buffer_size: Determines the number of rows that are buffered
            internally while iterating over the data.
            See https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.iter_rows.html

    Returns:
        An iterator over the DataFrame of rows.

    Notes:
        cuDF doesn't support this method.

    Examples:
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]})
        >>> iter_rows = nw.from_native(df_native).iter_rows()
        >>> next(iter_rows)
        (1, 6.0)
        >>> next(iter_rows)
        (2, 7.0)
    """
    compliant = self._compliant_frame
    return compliant.iter_rows(named=named, buffer_size=buffer_size)  # type: ignore[return-value]
def with_columns(
    self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
) -> Self:
    r"""Add columns to this DataFrame.

    A new column replaces any existing column that has the same name.

    Arguments:
        *exprs: Column(s) to add, specified as positional arguments.
            Accepts expression input. Strings are parsed as column names, other
            non-expression inputs are parsed as literals.

        **named_exprs: Additional columns to add, specified as keyword arguments.
            The columns will be renamed to the keyword used.

    Returns:
        DataFrame: A new DataFrame with the columns added.

    Note:
        Creating a new DataFrame using this method does not create a new copy of
        existing data.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame({"a": [1, 2], "b": [0.5, 4.0]})
        >>> (
        ...     nw.from_native(df_native)
        ...     .with_columns((nw.col("a") * 2).alias("a*2"))
        ...     .to_native()
        ... )
           a    b  a*2
        0  1  0.5    2
        1  2  4.0    4
    """
    extended = super().with_columns(*exprs, **named_exprs)
    return extended
def select(
    self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
) -> Self:
    r"""Select columns from this DataFrame.

    Arguments:
        *exprs: Column(s) to select, specified as positional arguments.
            Accepts expression input. Strings are parsed as column names,
            other non-expression inputs are parsed as literals.

        **named_exprs: Additional columns to select, specified as keyword arguments.
            The columns will be renamed to the keyword used.

    Returns:
        The dataframe containing only the selected columns.

    Examples:
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> df_native = pa.table({"a": [1, 2], "b": [3, 4]})
        >>> nw.from_native(df_native).select("a", a_plus_1=nw.col("a") + 1)
        ┌──────────────────┐
        |Narwhals DataFrame|
        |------------------|
        |pyarrow.Table     |
        |a: int64          |
        |a_plus_1: int64   |
        |----              |
        |a: [[1,2]]        |
        |a_plus_1: [[2,3]] |
        └──────────────────┘
    """
    selected = super().select(*exprs, **named_exprs)
    return selected

def rename(self, mapping: dict[str, str]) -> Self:
    """Rename column names.

    Arguments:
        mapping: Pairs of old name (key) and new name (value).

    Returns:
        The dataframe with the specified columns renamed.

    Examples:
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> df_native = pa.table({"foo": [1, 2], "bar": [6, 7]})
        >>> nw.from_native(df_native).rename({"foo": "apple"}).to_native()
        pyarrow.Table
        apple: int64
        bar: int64
        ----
        apple: [[1,2]]
        bar: [[6,7]]
    """
    renamed = super().rename(mapping)
    return renamed

def head(self, n: int = 5) -> Self:
    """Get the first `n` rows.

    Arguments:
        n: Number of rows to return. If a negative value is passed, return all rows
            except the last `abs(n)`.

    Returns:
        A subset of the dataframe of shape (n, n_columns).

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame({"a": [1, 2], "b": [0.5, 4.0]})
        >>> nw.from_native(df_native).head(1).to_native()
           a    b
        0  1  0.5
    """
    leading_rows = super().head(n)
    return leading_rows
def tail(self, n: int = 5) -> Self:
    """Get the last `n` rows.

    Arguments:
        n: Number of rows to return. If a negative value is passed, return all rows
            except the first `abs(n)`.

    Returns:
        A subset of the dataframe of shape (n, n_columns).

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame({"a": [1, 2], "b": [0.5, 4.0]})
        >>> nw.from_native(df_native).tail(1)
        ┌──────────────────┐
        |Narwhals DataFrame|
        |------------------|
        |       a    b     |
        |    1  2  4.0     |
        └──────────────────┘
    """
    trailing_rows = super().tail(n)
    return trailing_rows

def drop(self, *columns: str | Iterable[str], strict: bool = True) -> Self:
    """Remove columns from the dataframe.

    Arguments:
        *columns: Names of the columns that should be removed from the dataframe.
        strict: Validate that all column names exist in the schema and throw an
            exception if a column name does not exist in the schema.

    Returns:
        The dataframe with the specified columns removed.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame(
        ...     {"foo": [1, 2], "bar": [6.0, 7.0], "ham": ["a", "b"]}
        ... )
        >>> nw.from_native(df_native).drop("ham").to_native()
           foo  bar
        0    1  6.0
        1    2  7.0
    """
    # Names may arrive as separate arguments or as iterables; flatten first.
    names = flatten(columns)
    return super().drop(*names, strict=strict)

def unique(
    self,
    subset: str | list[str] | None = None,
    *,
    keep: UniqueKeepStrategy = "any",
    maintain_order: bool = False,
) -> Self:
    """Drop duplicate rows from this dataframe.

    Arguments:
        subset: Column name(s) to consider when identifying duplicate rows.
        keep: {'first', 'last', 'any', 'none'}
            Which of the duplicate rows to keep.

            * 'any': Does not give any guarantee of which row is kept.
              This allows more optimizations.
            * 'none': Don't keep duplicate rows.
            * 'first': Keep first unique row.
            * 'last': Keep last unique row.
        maintain_order: Keep the same order as the original DataFrame. This may be more
            expensive to compute.

    Returns:
        The dataframe with the duplicate rows removed.

    Raises:
        ValueError: If `keep` is not one of the four accepted strategies.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame(
        ...     {"foo": [1, 2], "bar": ["a", "a"], "ham": ["b", "b"]}
        ... )
        >>> nw.from_native(df_native).unique(["bar", "ham"]).to_native()
           foo bar ham
        0    1   a   b
    """
    if keep not in {"any", "none", "first", "last"}:
        raise ValueError(f"Expected {'any', 'none', 'first', 'last'}, got: {keep}")
    # A lone column name is handed to the backend as a one-element list.
    columns = [subset] if isinstance(subset, str) else subset
    deduplicated = self._compliant_frame.unique(
        columns, keep=keep, maintain_order=maintain_order
    )
    return self._with_compliant(deduplicated)
def filter(
    self, *predicates: IntoExpr | Iterable[IntoExpr] | list[bool], **constraints: Any
) -> Self:
    r"""Filter the rows in the DataFrame based on one or more predicate expressions.

    The original order of the remaining rows is preserved.

    Arguments:
        *predicates: Expression(s) that evaluates to a boolean Series. Can
            also be a (single!) boolean list.
        **constraints: Column filters; use `name = value` to filter columns by the supplied value.
            Each constraint will behave the same as `nw.col(name).eq(value)`, and will be implicitly
            joined with the other filter conditions using &.

    Returns:
        The filtered dataframe.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame(
        ...     {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]}
        ... )

        Filter on one condition

        >>> nw.from_native(df_native).filter(nw.col("foo") > 1).to_native()
           foo  bar ham
        1    2    7   b
        2    3    8   c

        Filter on multiple conditions with implicit `&`

        >>> nw.from_native(df_native).filter(
        ...     nw.col("foo") < 3, nw.col("ham") == "a"
        ... ).to_native()
           foo  bar ham
        0    1    6   a

        Filter on multiple conditions with `|`

        >>> nw.from_native(df_native).filter(
        ...     (nw.col("foo") == 1) | (nw.col("ham") == "c")
        ... ).to_native()
           foo  bar ham
        0    1    6   a
        2    3    8   c

        Filter using `**kwargs` syntax

        >>> nw.from_native(df_native).filter(foo=2, ham="b").to_native()
           foo  bar ham
        1    2    7   b
    """
    kept_rows = super().filter(*predicates, **constraints)
    return kept_rows
@overload
def group_by(
    self, *keys: IntoExpr | Iterable[IntoExpr], drop_null_keys: Literal[False] = ...
) -> GroupBy[Self]: ...

@overload
def group_by(
    self, *keys: str | Iterable[str], drop_null_keys: Literal[True]
) -> GroupBy[Self]: ...

def group_by(
    self, *keys: IntoExpr | Iterable[IntoExpr], drop_null_keys: bool = False
) -> GroupBy[Self]:
    r"""Start a group by operation.

    Arguments:
        *keys: Column(s) to group by. Accepts expression input. Strings are parsed as
            column names.
        drop_null_keys: if True, then groups where any key is null won't be included
            in the result.

    Returns:
        GroupBy: Object which can be used to perform aggregations.

    Raises:
        NotImplementedError: If `drop_null_keys` is True and any key is an
            Expr or a Series.
        ComputeError: If any key expression is not elementwise.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame(
        ...     {
        ...         "a": ["a", "b", "a", "b", "c"],
        ...         "b": [1, 2, 1, 3, 3],
        ...         "c": [5, 4, 3, 2, 1],
        ...     }
        ... )

        Group by one column and compute the sum of another column

        >>> nw.from_native(df_native, eager_only=True).group_by("a").agg(
        ...     nw.col("b").sum()
        ... ).sort("a").to_native()
           a  b
        0  a  2
        1  b  5
        2  c  3

        Group by multiple columns and compute the max of another column

        >>> (
        ...     nw.from_native(df_native, eager_only=True)
        ...     .group_by(["a", "b"])
        ...     .agg(nw.max("c"))
        ...     .sort("a", "b")
        ...     .to_native()
        ... )
           a  b  c
        0  a  1  5
        1  b  2  4
        2  b  3  2
        3  c  3  1

        Expressions are also accepted.

        >>> nw.from_native(df_native, eager_only=True).group_by(
        ...     "a", nw.col("b") // 2
        ... ).agg(nw.col("c").mean()).to_native()
           a  b    c
        0  a  0  4.0
        1  b  1  3.0
        2  c  1  1.0
    """
    from narwhals.group_by import GroupBy

    flat_keys = flatten(keys)

    # Fast path: plain column names need no expression handling.
    if all(isinstance(name, str) for name in flat_keys):
        return GroupBy(self, flat_keys, drop_null_keys=drop_null_keys)

    from narwhals import col
    from narwhals.expr import Expr
    from narwhals.series import Series

    expr_like_flags = tuple(isinstance(key, (Expr, Series)) for key in flat_keys)

    if drop_null_keys and any(expr_like_flags):
        raise NotImplementedError(
            "drop_null_keys cannot be True when keys contains Expr or Series"
        )

    # Anything that is not already an Expr/Series is turned into a column expression.
    normalized_keys = [
        key if expr_like else col(key)
        for key, expr_like in zip(flat_keys, expr_like_flags)
    ]
    compliant_keys, key_kinds = self._flatten_and_extract(*normalized_keys)

    if any(kind is not ExprKind.ELEMENTWISE for kind in key_kinds):
        from narwhals.exceptions import ComputeError

        raise ComputeError(
            "Group by is not supported with keys that are not elementwise expressions"
        )

    return GroupBy(self, compliant_keys, drop_null_keys=drop_null_keys)
def sort(
    self,
    by: str | Iterable[str],
    *more_by: str,
    descending: bool | Sequence[bool] = False,
    nulls_last: bool = False,
) -> Self:
    r"""Sort the dataframe by the given columns.

    Arguments:
        by: Column(s) names to sort by.
        *more_by: Additional columns to sort by, specified as positional arguments.
        descending: Sort in descending order. When sorting by multiple columns, can be
            specified per column by passing a sequence of booleans.
        nulls_last: Place null values last.

    Returns:
        The sorted dataframe.

    Note:
        Unlike Polars, it is not possible to specify a sequence of booleans for
        `nulls_last` in order to control per-column behaviour. Instead a single
        boolean is applied for all `by` columns.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame(
        ...     {"foo": [2, 1], "bar": [6.0, 7.0], "ham": ["a", "b"]}
        ... )
        >>> nw.from_native(df_native).sort("foo")
        ┌──────────────────┐
        |Narwhals DataFrame|
        |------------------|
        |    foo  bar ham  |
        | 1    1  7.0   b  |
        | 0    2  6.0   a  |
        └──────────────────┘
    """
    ordered = super().sort(
        by, *more_by, descending=descending, nulls_last=nulls_last
    )
    return ordered
def join(
    self,
    other: Self,
    on: str | list[str] | None = None,
    how: JoinStrategy = "inner",
    *,
    left_on: str | list[str] | None = None,
    right_on: str | list[str] | None = None,
    suffix: str = "_right",
) -> Self:
    r"""Join in SQL-like fashion.

    Arguments:
        other: DataFrame to join with.
        on: Name(s) of the join columns in both DataFrames. If set, `left_on` and
            `right_on` should be None.
        how: Join strategy.

            * *inner*: Returns rows that have matching values in both tables.
            * *left*: Returns all rows from the left table, and the matched rows from the right table.
            * *full*: Returns all rows in both dataframes, with the suffix appended to the right join keys.
            * *cross*: Returns the Cartesian product of rows from both tables.
            * *semi*: Filter rows that have a match in the right table.
            * *anti*: Filter rows that do not have a match in the right table.
        left_on: Join column of the left DataFrame.
        right_on: Join column of the right DataFrame.
        suffix: Suffix to append to columns with a duplicate name.

    Returns:
        A new joined DataFrame

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_1_native = pd.DataFrame({"id": ["a", "b"], "price": [6.0, 7.0]})
        >>> df_2_native = pd.DataFrame({"id": ["a", "b", "c"], "qty": [1, 2, 3]})
        >>> nw.from_native(df_1_native).join(nw.from_native(df_2_native), on="id")
        ┌──────────────────┐
        |Narwhals DataFrame|
        |------------------|
        |   id  price  qty |
        | 0  a    6.0    1 |
        | 1  b    7.0    2 |
        └──────────────────┘
    """
    joined = super().join(
        other, how=how, left_on=left_on, right_on=right_on, on=on, suffix=suffix
    )
    return joined
def join_asof(
    self,
    other: Self,
    *,
    left_on: str | None = None,
    right_on: str | None = None,
    on: str | None = None,
    by_left: str | list[str] | None = None,
    by_right: str | list[str] | None = None,
    by: str | list[str] | None = None,
    strategy: AsofJoinStrategy = "backward",
    suffix: str = "_right",
) -> Self:
    """Perform an asof join.

    This is similar to a left-join except that we match on nearest key rather than equal keys.

    For Polars, both DataFrames must be sorted by the `on` key (within each `by` group
    if specified).

    Arguments:
        other: DataFrame to join with.
        left_on: Name of the left join column.
        right_on: Name of the right join column.
        on: Join column of both DataFrames. If set, left_on and right_on should be None.
        by_left: join on these columns before doing asof join.
        by_right: join on these columns before doing asof join.
        by: join on these columns before doing asof join.
        strategy: Join strategy. The default is "backward".

            * *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key.
            * *forward*: selects the first row in the right DataFrame whose "on" key is greater than or equal to the left's key.
            * *nearest*: search selects the last row in the right DataFrame whose value is nearest to the left's key.
        suffix: Suffix to append to columns with a duplicate name.

    Returns:
        A new joined DataFrame

    Examples:
        >>> from datetime import datetime
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> data_gdp = {
        ...     "datetime": [
        ...         datetime(2016, 1, 1),
        ...         datetime(2017, 1, 1),
        ...         datetime(2018, 1, 1),
        ...         datetime(2019, 1, 1),
        ...         datetime(2020, 1, 1),
        ...     ],
        ...     "gdp": [4164, 4411, 4566, 4696, 4827],
        ... }
        >>> data_population = {
        ...     "datetime": [
        ...         datetime(2016, 3, 1),
        ...         datetime(2018, 8, 1),
        ...         datetime(2019, 1, 1),
        ...     ],
        ...     "population": [82.19, 82.66, 83.12],
        ... }
        >>> gdp_native = pd.DataFrame(data_gdp)
        >>> population_native = pd.DataFrame(data_population)
        >>> gdp = nw.from_native(gdp_native)
        >>> population = nw.from_native(population_native)
        >>> population.join_asof(gdp, on="datetime", strategy="backward")
        ┌──────────────────────────────┐
        |      Narwhals DataFrame      |
        |------------------------------|
        |    datetime  population   gdp|
        |0 2016-03-01       82.19  4164|
        |1 2018-08-01       82.66  4566|
        |2 2019-01-01       83.12  4696|
        └──────────────────────────────┘
    """
    joined = super().join_asof(
        other,
        left_on=left_on,
        right_on=right_on,
        on=on,
        by_left=by_left,
        by_right=by_right,
        by=by,
        strategy=strategy,
        suffix=suffix,
    )
    return joined
# --- descriptive ---
def is_duplicated(self) -> Series[Any]:
    r"""Get a mask of all duplicated rows in this DataFrame.

    Returns:
        A new Series.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame({"foo": [2, 2, 2], "bar": [6.0, 6.0, 7.0]})
        >>> nw.from_native(df_native).is_duplicated()
        ┌───────────────┐
        |Narwhals Series|
        |---------------|
        |  0     True   |
        |  1     True   |
        |  2    False   |
        |  dtype: bool  |
        └───────────────┘
    """
    # A row is duplicated exactly when it is not unique.
    unique_mask = self.is_unique()
    return ~unique_mask

def is_empty(self) -> bool:
    r"""Check if the dataframe is empty.

    Returns:
        A boolean indicating whether the dataframe is empty (True) or not (False).

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame({"foo": [2, 2, 2], "bar": [6.0, 6.0, 7.0]})
        >>> nw.from_native(df_native).is_empty()
        False
    """
    row_count = len(self)
    return row_count == 0
def is_unique(self) -> Series[Any]:
    r"""Get a mask of all unique rows in this DataFrame.

    Returns:
        A new Series.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame({"foo": [2, 2, 2], "bar": [6.0, 6.0, 7.0]})
        >>> nw.from_native(df_native).is_unique()
        ┌───────────────┐
        |Narwhals Series|
        |---------------|
        |  0    False   |
        |  1    False   |
        |  2     True   |
        |  dtype: bool  |
        └───────────────┘
    """
    native_mask = self._compliant_frame.is_unique()
    return self._series(native_mask, level=self._level)

def null_count(self) -> Self:
    r"""Create a new DataFrame that shows the null counts per column.

    Returns:
        A dataframe of shape (1, n_columns).

    Notes:
        pandas handles null values differently from Polars and PyArrow.
        See [null_handling](../concepts/null_handling.md/)
        for reference.

    Examples:
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> df_native = pa.table({"foo": [1, None], "bar": [2, 3]})
        >>> nw.from_native(df_native).null_count()
        ┌──────────────────┐
        |Narwhals DataFrame|
        |------------------|
        |  pyarrow.Table   |
        |  foo: int64      |
        |  bar: int64      |
        |  ----            |
        |  foo: [[1]]      |
        |  bar: [[0]]      |
        └──────────────────┘
    """
    compliant = self._compliant_frame
    namespace = compliant.__narwhals_namespace__()
    # Count nulls in every column with a single backend-level select.
    counts = compliant.select(namespace.all().null_count())
    return self._with_compliant(counts)

def item(self, row: int | None = None, column: int | str | None = None) -> Any:
    r"""Return the DataFrame as a scalar, or return the element at the given row/column.

    Arguments:
        row: The *n*-th row.
        column: The column selected via an integer or a string (column name).

    Returns:
        A scalar or the specified element in the dataframe.

    Notes:
        If row/col not provided, this is equivalent to df[0,0], with a check that the shape is (1,1).
        With row/col, this is equivalent to df[row,col].

    Examples:
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> df_native = pa.table({"foo": [1, None], "bar": [2, 3]})
        >>> nw.from_native(df_native).item(0, 1)
        2
    """
    compliant = self._compliant_frame
    return compliant.item(row=row, column=column)
def clone(self) -> Self:
    r"""Create a copy of this DataFrame.

    Returns:
        An identical copy of the original dataframe.
    """
    duplicate = self._compliant_frame.clone()
    return self._with_compliant(duplicate)

def gather_every(self, n: int, offset: int = 0) -> Self:
    r"""Take every nth row in the DataFrame and return as a new DataFrame.

    Arguments:
        n: Gather every *n*-th row.
        offset: Starting index.

    Returns:
        The dataframe containing only the selected rows.

    Examples:
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> df_native = pa.table({"foo": [1, None, 2, 3]})
        >>> nw.from_native(df_native).gather_every(2)
        ┌──────────────────┐
        |Narwhals DataFrame|
        |------------------|
        |  pyarrow.Table   |
        |  foo: int64      |
        |  ----            |
        |  foo: [[1,2]]    |
        └──────────────────┘
    """
    gathered = super().gather_every(n=n, offset=offset)
    return gathered

def pivot(
    self,
    on: str | list[str],
    *,
    index: str | list[str] | None = None,
    values: str | list[str] | None = None,
    aggregate_function: PivotAgg | None = None,
    maintain_order: bool | None = None,
    sort_columns: bool = False,
    separator: str = "_",
) -> Self:
    r"""Create a spreadsheet-style pivot table as a DataFrame.

    Arguments:
        on: Name of the column(s) whose values will be used as the header of the
            output DataFrame.
        index: One or multiple keys to group by. If None, all remaining columns not
            specified on `on` and `values` will be used. At least one of `index` and
            `values` must be specified.
        values: One or multiple keys to group by. If None, all remaining columns not
            specified on `on` and `index` will be used. At least one of `index` and
            `values` must be specified.
        aggregate_function: Choose from

            - None: no aggregation takes place, will raise error if multiple values
              are in group.
            - A predefined aggregate function string, one of
              {'min', 'max', 'first', 'last', 'sum', 'mean', 'median', 'len'}
        maintain_order: Has no effect and is kept around only for backwards-compatibility.
        sort_columns: Sort the transposed columns by name. Default is by order of
            discovery.
        separator: Used as separator/delimiter in generated column names in case of
            multiple `values` columns.

    Returns:
        A new dataframe.

    Raises:
        ValueError: If both `values` and `index` are None.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> data = {
        ...     "ix": [1, 1, 2, 2, 1, 2],
        ...     "col": ["a", "a", "a", "a", "b", "b"],
        ...     "foo": [0, 1, 2, 2, 7, 1],
        ...     "bar": [0, 2, 0, 0, 9, 4],
        ... }
        >>> df_native = pd.DataFrame(data)
        >>> nw.from_native(df_native).pivot(
        ...     "col", index="ix", aggregate_function="sum"
        ... )
        ┌─────────────────────────────────┐
        |       Narwhals DataFrame        |
        |---------------------------------|
        |   ix  foo_a  foo_b  bar_a  bar_b|
        |0   1      1      7      2      9|
        |1   2      4      1      0      4|
        └─────────────────────────────────┘
    """
    if index is None and values is None:
        raise ValueError("At least one of `values` and `index` must be passed")
    if maintain_order is not None:
        # The argument is accepted but ignored; tell the caller it can go.
        warn(
            message=(
                "`maintain_order` has no effect and is only kept around for backwards-compatibility. "
                "You can safely remove this argument."
            ),
            category=UserWarning,
            stacklevel=find_stacklevel(),
        )

    def _as_list(names: Any) -> Any:
        # A single name becomes a one-element list; lists and None pass through.
        return [names] if isinstance(names, str) else names

    pivoted = self._compliant_frame.pivot(
        on=_as_list(on),
        index=_as_list(index),
        values=_as_list(values),
        aggregate_function=aggregate_function,
        sort_columns=sort_columns,
        separator=separator,
    )
    return self._with_compliant(pivoted)
def to_arrow(self) -> pa.Table:
    r"""Convert to arrow table.

    Returns:
        A new PyArrow table.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame({"foo": [1, None], "bar": [2, 3]})
        >>> nw.from_native(df_native).to_arrow()
        pyarrow.Table
        foo: double
        bar: int64
        ----
        foo: [[1,null]]
        bar: [[2,3]]
    """
    compliant = self._compliant_frame
    return compliant.to_arrow()

def sample(
    self,
    n: int | None = None,
    *,
    fraction: float | None = None,
    with_replacement: bool = False,
    seed: int | None = None,
) -> Self:
    r"""Sample from this DataFrame.

    Arguments:
        n: Number of items to return. Cannot be used with fraction.
        fraction: Fraction of items to return. Cannot be used with n.
        with_replacement: Allow values to be sampled more than once.
        seed: Seed for the random number generator. If set to None (default), a random
            seed is generated for each sample operation.

    Returns:
        A new dataframe.

    Notes:
        The results may not be consistent across libraries.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame({"foo": [1, 2, 3], "bar": [19, 32, 4]})
        >>> nw.from_native(df_native).sample(n=2)  # doctest:+SKIP
        ┌──────────────────┐
        |Narwhals DataFrame|
        |------------------|
        |      foo  bar    |
        |   2    3    4    |
        |   1    2   32    |
        └──────────────────┘
    """
    sampled = self._compliant_frame.sample(
        n=n, fraction=fraction, with_replacement=with_replacement, seed=seed
    )
    return self._with_compliant(sampled)
def unpivot(
    self,
    on: str | list[str] | None = None,
    *,
    index: str | list[str] | None = None,
    variable_name: str = "variable",
    value_name: str = "value",
) -> Self:
    r"""Unpivot a DataFrame from wide to long format.

    Optionally leaves identifiers set.

    This function is useful to massage a DataFrame into a format where one or more
    columns are identifier variables (index) while all other columns, considered
    measured variables (on), are "unpivoted" to the row axis leaving just
    two non-identifier columns, 'variable' and 'value'.

    Arguments:
        on: Column(s) to use as values variables; if `on` is empty all columns that
            are not in `index` will be used.
        index: Column(s) to use as identifier variables.
        variable_name: Name to give to the `variable` column. Defaults to "variable".
        value_name: Name to give to the `value` column. Defaults to "value".

    Returns:
        The unpivoted dataframe.

    Notes:
        If you're coming from pandas, this is similar to `pandas.DataFrame.melt`,
        but with `index` replacing `id_vars` and `on` replacing `value_vars`.
        In other frameworks, you might know this operation as `pivot_longer`.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> data = {"a": ["x", "y", "z"], "b": [1, 3, 5], "c": [2, 4, 6]}
        >>> df_native = pd.DataFrame(data)
        >>> nw.from_native(df_native).unpivot(["b", "c"], index="a")
        ┌────────────────────┐
        | Narwhals DataFrame |
        |--------------------|
        |   a variable  value|
        |0  x        b      1|
        |1  y        b      3|
        |2  z        b      5|
        |3  x        c      2|
        |4  y        c      4|
        |5  z        c      6|
        └────────────────────┘
    """
    long_frame = super().unpivot(
        on=on, index=index, variable_name=variable_name, value_name=value_name
    )
    return long_frame
def explode(self, columns: str | Sequence[str], *more_columns: str) -> Self:
    """Explode the dataframe to long format by exploding the given columns.

    Notes:
        Multiple columns can be exploded together only if they have
        matching element counts.

    Arguments:
        columns: Column names. The underlying columns being exploded must be of the `List` data type.
        *more_columns: Additional names of columns to explode, specified as positional arguments.

    Returns:
        New DataFrame

    Examples:
        >>> import polars as pl
        >>> import narwhals as nw
        >>> data = {"a": ["x", "y"], "b": [[1, 2], [3]]}
        >>> df_native = pl.DataFrame(data)
        >>> nw.from_native(df_native).explode("b").to_native()
        shape: (3, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ str ┆ i64 │
        ╞═════╪═════╡
        │ x   ┆ 1   │
        │ x   ┆ 2   │
        │ y   ┆ 3   │
        └─────┴─────┘
    """
    return super().explode(columns, *more_columns)


class LazyFrame(BaseFrame[FrameT]):
    """Narwhals LazyFrame, backed by a native lazyframe.

    Warning:
        This class is not meant to be instantiated directly - instead use
        [`narwhals.from_native`][] with a native
        object that is a lazy dataframe from one of the supported
        backends (e.g. polars.LazyFrame, dask_expr._collection.DataFrame):
        ```py
        narwhals.from_native(native_lazyframe)
        ```
    """
def _extract_compliant(self, arg: Any) -> Any:
    """Translate `arg` into the compliant-level object this LazyFrame works with.

    The checks run in order: frames yield their compliant frame, strings
    become column expressions, and Narwhals expressions are converted after
    two validity checks.

    Raises:
        TypeError: If `arg` is a Series, or an object whose type name
            contains "polars" while Polars is available.
        OrderDependentExprError: If an expression has orderable operations.
        LengthChangingExprError: If an expression is flagged as a filtration.
        InvalidIntoExprError: For any other unsupported argument type.
    """
    # Imported locally rather than at module top.
    from narwhals.expr import Expr
    from narwhals.series import Series

    if isinstance(arg, BaseFrame):
        return arg._compliant_frame
    if isinstance(arg, Series):  # pragma: no cover
        msg = "Binary operations between Series and LazyFrame are not supported."
        raise TypeError(msg)
    if isinstance(arg, str):  # pragma: no cover
        # A bare string is interpreted as a column name.
        plx = self.__narwhals_namespace__()
        return plx.col(arg)
    if isinstance(arg, Expr):
        # Reject expressions whose metadata reports orderable operations.
        if arg._metadata.n_orderable_ops:
            msg = (
                "Order-dependent expressions are not supported for use in LazyFrame.\n\n"
                "Hint: To make the expression valid, use `.over` with `order_by` specified.\n\n"
                "For example, if you wrote `nw.col('price').cum_sum()` and you have a column\n"
                "`'date'` which orders your data, then replace:\n\n"
                " nw.col('price').cum_sum()\n\n"
                " with:\n\n"
                " nw.col('price').cum_sum().over(order_by='date')\n"
                " ^^^^^^^^^^^^^^^^^^^^^^\n\n"
                "See https://narwhals-dev.github.io/narwhals/concepts/order_dependence/."
            )
            raise OrderDependentExprError(msg)
        # Reject expressions flagged as filtrations (they change the row count).
        if arg._metadata.is_filtration:
            msg = (
                "Length-changing expressions are not supported for use in LazyFrame, unless\n"
                "followed by an aggregation.\n\n"
                "Hints:\n"
                "- Instead of `lf.select(nw.col('a').head())`, use `lf.select('a').head()\n"
                "- Instead of `lf.select(nw.col('a').drop_nulls()).select(nw.sum('a'))`,\n"
                " use `lf.select(nw.col('a').drop_nulls().sum())\n"
            )
            raise LengthChangingExprError(msg)
        return arg._to_compliant_expr(self.__narwhals_namespace__())
    # Give a targeted hint when a native Polars object was passed by mistake.
    if get_polars() is not None and "polars" in str(type(arg)):  # pragma: no cover
        msg = (
            f"Expected Narwhals object, got: {type(arg)}.\n\n"
            "Perhaps you:\n"
            "- Forgot a `nw.from_native` somewhere?\n"
            "- Used `pl.col` instead of `nw.col`?"
        )
        raise TypeError(msg)
    raise InvalidIntoExprError.from_invalid_type(type(arg))  # pragma: no cover
+ ) + raise TypeError(msg) + raise InvalidIntoExprError.from_invalid_type(type(arg)) # pragma: no cover + + @property + def _dataframe(self) -> type[DataFrame[Any]]: + return DataFrame + + def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) -> None: + self._level = level + self._compliant_frame: CompliantLazyFrame[Any, FrameT, Self] # type: ignore[type-var] + if is_compliant_lazyframe(df): + self._compliant_frame = df.__narwhals_lazyframe__() + else: # pragma: no cover + msg = f"Expected Polars LazyFrame or an object that implements `__narwhals_lazyframe__`, got: {type(df)}" + raise AssertionError(msg) + + def __repr__(self) -> str: # pragma: no cover + return generate_repr("Narwhals LazyFrame", self.to_native().__repr__()) + + @property + def implementation(self) -> Implementation: + """Return implementation of native frame. + + This can be useful when you need to use special-casing for features outside of + Narwhals' scope - for example, when dealing with pandas' Period Dtype. + + Returns: + Implementation. + + Examples: + >>> import narwhals as nw + >>> import dask.dataframe as dd + >>> lf_native = dd.from_dict({"a": [1, 2]}, npartitions=1) + >>> nw.from_native(lf_native).implementation + + """ + return self._compliant_frame._implementation + + def __getitem__(self, item: str | slice) -> NoReturn: + msg = "Slicing is not supported on LazyFrame" + raise TypeError(msg) + + def collect( + self, backend: ModuleType | Implementation | str | None = None, **kwargs: Any + ) -> DataFrame[Any]: + r"""Materialize this LazyFrame into a DataFrame. + + As each underlying lazyframe has different arguments to set when materializing + the lazyframe into a dataframe, we allow to pass them as kwargs (see examples + below for how to generalize the specification). + + Arguments: + backend: specifies which eager backend collect to. This will be the underlying + backend for the resulting Narwhals DataFrame. 
If None, then the following + default conversions will be applied + + - `polars.LazyFrame` -> `polars.DataFrame` + - `dask.DataFrame` -> `pandas.DataFrame` + - `duckdb.PyRelation` -> `pyarrow.Table` + - `pyspark.DataFrame` -> `pyarrow.Table` + + `backend` can be specified in various ways + + - As `Implementation.` with `BACKEND` being `PANDAS`, `PYARROW` + or `POLARS`. + - As a string: `"pandas"`, `"pyarrow"` or `"polars"` + - Directly as a module `pandas`, `pyarrow` or `polars`. + kwargs: backend specific kwargs to pass along. To know more please check the + backend specific documentation + + - [polars.LazyFrame.collect](https://docs.pola.rs/api/python/dev/reference/lazyframe/api/polars.LazyFrame.collect.html) + - [dask.dataframe.DataFrame.compute](https://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.compute.html) + + Returns: + DataFrame + + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 2), (3, 4) df(a, b)") + >>> lf = nw.from_native(lf_native) + >>> lf + ┌──────────────────┐ + |Narwhals LazyFrame| + |------------------| + |┌───────┬───────┐ | + |│ a │ b │ | + |│ int32 │ int32 │ | + |├───────┼───────┤ | + |│ 1 │ 2 │ | + |│ 3 │ 4 │ | + |└───────┴───────┘ | + └──────────────────┘ + >>> lf.collect() + ┌──────────────────┐ + |Narwhals DataFrame| + |------------------| + | pyarrow.Table | + | a: int32 | + | b: int32 | + | ---- | + | a: [[1,3]] | + | b: [[2,4]] | + └──────────────────┘ + """ + eager_backend = None if backend is None else Implementation.from_backend(backend) + supported_eager_backends = ( + Implementation.POLARS, + Implementation.PANDAS, + Implementation.PYARROW, + ) + if eager_backend is not None and eager_backend not in supported_eager_backends: + msg = f"Unsupported `backend` value.\nExpected one of {supported_eager_backends} or None, got: {eager_backend}." 
+ raise ValueError(msg) + return self._dataframe( + self._compliant_frame.collect(backend=eager_backend, **kwargs), level="full" + ) + + def to_native(self) -> FrameT: + """Convert Narwhals LazyFrame to native one. + + Returns: + Object of class that user started with. + + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 2), (3, 4) df(a, b)") + >>> nw.from_native(lf_native).to_native() + ┌───────┬───────┐ + │ a │ b │ + │ int32 │ int32 │ + ├───────┼───────┤ + │ 1 │ 2 │ + │ 3 │ 4 │ + └───────┴───────┘ + + """ + return to_native(narwhals_object=self, pass_through=False) + + # inherited + def pipe( + self, + function: Callable[Concatenate[Self, PS], R], + *args: PS.args, + **kwargs: PS.kwargs, + ) -> R: + """Pipe function call. + + Arguments: + function: Function to apply. + args: Positional arguments to pass to function. + kwargs: Keyword arguments to pass to function. + + Returns: + The original object with the function applied. + + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 2), (3, 4) df(a, b)") + >>> nw.from_native(lf_native).pipe(lambda x: x.select("a")).to_native() + ┌───────┐ + │ a │ + │ int32 │ + ├───────┤ + │ 1 │ + │ 3 │ + └───────┘ + + """ + return super().pipe(function, *args, **kwargs) + + def drop_nulls(self, subset: str | list[str] | None = None) -> Self: + """Drop rows that contain null values. + + Arguments: + subset: Column name(s) for which null values are considered. If set to None + (default), use all columns. + + Returns: + The original object with the rows removed that contained the null values. + + Notes: + pandas handles null values differently from Polars and PyArrow. + See [null_handling](../concepts/null_handling.md/) + for reference. 
+ + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, NULL), (3, 4) df(a, b)") + >>> nw.from_native(lf_native).drop_nulls() + ┌──────────────────┐ + |Narwhals LazyFrame| + |------------------| + |┌───────┬───────┐ | + |│ a │ b │ | + |│ int32 │ int32 │ | + |├───────┼───────┤ | + |│ 3 │ 4 │ | + |└───────┴───────┘ | + └──────────────────┘ + """ + return super().drop_nulls(subset=subset) + + def with_row_index(self, name: str = "index") -> Self: + """Insert column which enumerates rows. + + Arguments: + name: The name of the column as a string. The default is "index". + + Returns: + The original object with the column added. + + Examples: + >>> import dask.dataframe as dd + >>> import narwhals as nw + >>> lf_native = dd.from_dict({"a": [1, 2], "b": [4, 5]}, npartitions=1) + >>> nw.from_native(lf_native).with_row_index().collect() + ┌──────────────────┐ + |Narwhals DataFrame| + |------------------| + | index a b | + | 0 0 1 4 | + | 1 1 2 5 | + └──────────────────┘ + """ + return super().with_row_index(name) + + @property + def schema(self) -> Schema: + r"""Get an ordered mapping of column names to their data type. + + Returns: + A Narwhals Schema object that displays the mapping of column names. + + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) df(a, b)") + >>> nw.from_native(lf_native).schema + Schema({'a': Int32, 'b': Decimal}) + """ + return super().schema + + def collect_schema(self) -> Schema: + r"""Get an ordered mapping of column names to their data type. + + Returns: + A Narwhals Schema object that displays the mapping of column names. + + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) 
df(a, b)") + >>> nw.from_native(lf_native).collect_schema() + Schema({'a': Int32, 'b': Decimal}) + """ + return super().collect_schema() + + @property + def columns(self) -> list[str]: + r"""Get column names. + + Returns: + The column names stored in a list. + + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) df(a, b)") + >>> nw.from_native(lf_native).columns + ['a', 'b'] + """ + return super().columns + + def with_columns( + self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr + ) -> Self: + r"""Add columns to this LazyFrame. + + Added columns will replace existing columns with the same name. + + Arguments: + *exprs: Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + + **named_exprs: Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns: + LazyFrame: A new LazyFrame with the columns added. + + Note: + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) 
df(a, b)") + >>> nw.from_native(lf_native).with_columns(c=nw.col("a") + 1) + ┌────────────────────────────────┐ + | Narwhals LazyFrame | + |--------------------------------| + |┌───────┬──────────────┬───────┐| + |│ a │ b │ c │| + |│ int32 │ decimal(2,1) │ int32 │| + |├───────┼──────────────┼───────┤| + |│ 1 │ 4.5 │ 2 │| + |│ 3 │ 2.0 │ 4 │| + |└───────┴──────────────┴───────┘| + └────────────────────────────────┘ + """ + if not exprs and not named_exprs: + msg = "At least one expression must be passed to LazyFrame.with_columns" + raise ValueError(msg) + return super().with_columns(*exprs, **named_exprs) + + def select( + self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr + ) -> Self: + r"""Select columns from this LazyFrame. + + Arguments: + *exprs: Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names. + **named_exprs: Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns: + The LazyFrame containing only the selected columns. + + Notes: + If you'd like to select a column whose name isn't a string (for example, + if you're working with pandas) then you should explicitly use `nw.col` instead + of just passing the column name. For example, to select a column named + `0` use `df.select(nw.col(0))`, not `df.select(0)`. + + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) 
df(a, b)") + >>> nw.from_native(lf_native).select("a", a_plus_1=nw.col("a") + 1) + ┌────────────────────┐ + | Narwhals LazyFrame | + |--------------------| + |┌───────┬──────────┐| + |│ a │ a_plus_1 │| + |│ int32 │ int32 │| + |├───────┼──────────┤| + |│ 1 │ 2 │| + |│ 3 │ 4 │| + |└───────┴──────────┘| + └────────────────────┘ + """ + if not exprs and not named_exprs: + msg = "At least one expression must be passed to LazyFrame.select" + raise ValueError(msg) + return super().select(*exprs, **named_exprs) + + def rename(self, mapping: dict[str, str]) -> Self: + r"""Rename column names. + + Arguments: + mapping: Key value pairs that map from old name to new name, or a + function that takes the old name as input and returns the + new name. + + Returns: + The LazyFrame with the specified columns renamed. + + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) df(a, b)") + >>> nw.from_native(lf_native).rename({"a": "c"}) + ┌────────────────────────┐ + | Narwhals LazyFrame | + |------------------------| + |┌───────┬──────────────┐| + |│ c │ b │| + |│ int32 │ decimal(2,1) │| + |├───────┼──────────────┤| + |│ 1 │ 4.5 │| + |│ 3 │ 2.0 │| + |└───────┴──────────────┘| + └────────────────────────┘ + """ + return super().rename(mapping) + + def head(self, n: int = 5) -> Self: + r"""Get `n` rows. + + Arguments: + n: Number of rows to return. + + Returns: + A subset of the LazyFrame of shape (n, n_columns). + + Examples: + >>> import dask.dataframe as dd + >>> import narwhals as nw + >>> lf_native = dd.from_dict({"a": [1, 2, 3], "b": [4, 5, 6]}, npartitions=1) + >>> nw.from_native(lf_native).head(2).collect() + ┌──────────────────┐ + |Narwhals DataFrame| + |------------------| + | a b | + | 0 1 4 | + | 1 2 5 | + └──────────────────┘ + """ + return super().head(n) + + def tail(self, n: int = 5) -> Self: # pragma: no cover + r"""Get the last `n` rows. 
+ + Warning: + `LazyFrame.tail` is deprecated and will be removed in a future version. + Note: this will remain available in `narwhals.stable.v1`. + See [stable api](../backcompat.md/) for more information. + + Arguments: + n: Number of rows to return. + + Returns: + A subset of the LazyFrame of shape (n, n_columns). + """ + return super().tail(n) + + def drop(self, *columns: str | Iterable[str], strict: bool = True) -> Self: + r"""Remove columns from the LazyFrame. + + Arguments: + *columns: Names of the columns that should be removed from the dataframe. + strict: Validate that all column names exist in the schema and throw an + exception if a column name does not exist in the schema. + + Returns: + The LazyFrame with the specified columns removed. + + Warning: + `strict` argument is ignored for `polars<1.0.0`. + + Please consider upgrading to a newer version or pass to eager mode. + + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 2), (3, 4) df(a, b)") + >>> nw.from_native(lf_native).drop("a").to_native() + ┌───────┐ + │ b │ + │ int32 │ + ├───────┤ + │ 2 │ + │ 4 │ + └───────┘ + + """ + return super().drop(*flatten(columns), strict=strict) + + def unique( + self, + subset: str | list[str] | None = None, + *, + keep: LazyUniqueKeepStrategy = "any", + ) -> Self: + """Drop duplicate rows from this LazyFrame. + + Arguments: + subset: Column name(s) to consider when identifying duplicate rows. + If set to `None`, use all columns. + keep: {'any', 'none'} + Which of the duplicate rows to keep. + + * 'any': Does not give any guarantee of which row is kept. + * 'none': Don't keep duplicate rows. + + Returns: + The LazyFrame with unique rows. 
+ + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 1), (3, 4) df(a, b)") + >>> nw.from_native(lf_native).unique("a").sort("a", descending=True) + ┌──────────────────┐ + |Narwhals LazyFrame| + |------------------| + |┌───────┬───────┐ | + |│ a │ b │ | + |│ int32 │ int32 │ | + |├───────┼───────┤ | + |│ 3 │ 4 │ | + |│ 1 │ 1 │ | + |└───────┴───────┘ | + └──────────────────┘ + """ + if keep not in {"any", "none"}: + msg = ( + "narwhals.LazyFrame makes no assumptions about row order, so only " + f"'any' and 'none' are supported for `keep` in `unique`. Got: {keep}." + ) + raise ValueError(msg) + if isinstance(subset, str): + subset = [subset] + return self._with_compliant( + self._compliant_frame.unique(subset=subset, keep=keep) + ) + + def filter( + self, *predicates: IntoExpr | Iterable[IntoExpr] | list[bool], **constraints: Any + ) -> Self: + r"""Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Arguments: + *predicates: Expression that evaluates to a boolean Series. Can + also be a (single!) boolean list. + **constraints: Column filters; use `name = value` to filter columns by the supplied value. + Each constraint will behave the same as `nw.col(name).eq(value)`, and will be implicitly + joined with the other filter conditions using &. + + Returns: + The filtered LazyFrame. + + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> df_native = duckdb.sql(''' + ... SELECT * FROM VALUES + ... (1, 6, 'a'), + ... (2, 7, 'b'), + ... (3, 8, 'c') + ... df(foo, bar, ham) + ... 
''') + + Filter on one condition + + >>> nw.from_native(df_native).filter(nw.col("foo") > 1).to_native() + ┌───────┬───────┬─────────┐ + │ foo │ bar │ ham │ + │ int32 │ int32 │ varchar │ + ├───────┼───────┼─────────┤ + │ 2 │ 7 │ b │ + │ 3 │ 8 │ c │ + └───────┴───────┴─────────┘ + + + Filter on multiple conditions with implicit `&` + + >>> nw.from_native(df_native).filter( + ... nw.col("foo") < 3, nw.col("ham") == "a" + ... ).to_native() + ┌───────┬───────┬─────────┐ + │ foo │ bar │ ham │ + │ int32 │ int32 │ varchar │ + ├───────┼───────┼─────────┤ + │ 1 │ 6 │ a │ + └───────┴───────┴─────────┘ + + + Filter on multiple conditions with `|` + + >>> nw.from_native(df_native).filter( + ... (nw.col("foo") == 1) | (nw.col("ham") == "c") + ... ).to_native() + ┌───────┬───────┬─────────┐ + │ foo │ bar │ ham │ + │ int32 │ int32 │ varchar │ + ├───────┼───────┼─────────┤ + │ 1 │ 6 │ a │ + │ 3 │ 8 │ c │ + └───────┴───────┴─────────┘ + + + Filter using `**kwargs` syntax + + >>> nw.from_native(df_native).filter(foo=2, ham="b").to_native() + ┌───────┬───────┬─────────┐ + │ foo │ bar │ ham │ + │ int32 │ int32 │ varchar │ + ├───────┼───────┼─────────┤ + │ 2 │ 7 │ b │ + └───────┴───────┴─────────┘ + + """ + if ( + len(predicates) == 1 and is_list_of(predicates[0], bool) and not constraints + ): # pragma: no cover + msg = "`LazyFrame.filter` is not supported with Python boolean masks - use expressions instead." + raise TypeError(msg) + + return super().filter(*predicates, **constraints) + + @overload + def group_by( + self, *keys: IntoExpr | Iterable[IntoExpr], drop_null_keys: Literal[False] = ... + ) -> LazyGroupBy[Self]: ... + + @overload + def group_by( + self, *keys: str | Iterable[str], drop_null_keys: Literal[True] + ) -> LazyGroupBy[Self]: ... + + def group_by( + self, *keys: IntoExpr | Iterable[IntoExpr], drop_null_keys: bool = False + ) -> LazyGroupBy[Self]: + r"""Start a group by operation. + + Arguments: + *keys: Column(s) to group by. Accepts expression input. 
Strings are parsed as + column names. + drop_null_keys: if True, then groups where any key is null won't be + included in the result. + + Returns: + Object which can be used to perform aggregations. + + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> df_native = duckdb.sql( + ... "SELECT * FROM VALUES (1, 'a'), (2, 'b'), (3, 'a') df(a, b)" + ... ) + >>> df = nw.from_native(df_native) + >>> df.group_by("b").agg(nw.col("a").sum()).sort("b").to_native() + ┌─────────┬────────┐ + │ b │ a │ + │ varchar │ int128 │ + ├─────────┼────────┤ + │ a │ 4 │ + │ b │ 2 │ + └─────────┴────────┘ + + + Expressions are also accepted. + + >>> df.group_by(nw.col("b").str.len_chars()).agg( + ... nw.col("a").sum() + ... ).to_native() + ┌───────┬────────┐ + │ b │ a │ + │ int64 │ int128 │ + ├───────┼────────┤ + │ 1 │ 6 │ + └───────┴────────┘ + + """ + from narwhals.group_by import LazyGroupBy + + flat_keys = flatten(keys) + + if all(isinstance(key, str) for key in flat_keys): + return LazyGroupBy(self, flat_keys, drop_null_keys=drop_null_keys) + + from narwhals import col + from narwhals.expr import Expr + + key_is_expr = tuple(isinstance(k, Expr) for k in flat_keys) + + if drop_null_keys and any(key_is_expr): + msg = "drop_null_keys cannot be True when keys contains Expr" + raise NotImplementedError(msg) + + _keys = [k if is_expr else col(k) for k, is_expr in zip(flat_keys, key_is_expr)] + expr_flat_keys, kinds = self._flatten_and_extract(*_keys) + + if not all(kind is ExprKind.ELEMENTWISE for kind in kinds): + from narwhals.exceptions import ComputeError + + msg = ( + "Group by is not supported with keys that are not elementwise expressions" + ) + raise ComputeError(msg) + + return LazyGroupBy(self, expr_flat_keys, drop_null_keys=drop_null_keys) + + def sort( + self, + by: str | Iterable[str], + *more_by: str, + descending: bool | Sequence[bool] = False, + nulls_last: bool = False, + ) -> Self: + r"""Sort the LazyFrame by the given columns. 
+ + Arguments: + by: Column(s) names to sort by. + *more_by: Additional columns to sort by, specified as positional arguments. + descending: Sort in descending order. When sorting by multiple columns, can be + specified per column by passing a sequence of booleans. + nulls_last: Place null values last; can specify a single boolean applying to + all columns or a sequence of booleans for per-column control. + + Returns: + The sorted LazyFrame. + + Warning: + Unlike Polars, it is not possible to specify a sequence of booleans for + `nulls_last` in order to control per-column behaviour. Instead a single + boolean is applied for all `by` columns. + + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> df_native = duckdb.sql( + ... "SELECT * FROM VALUES (1, 6.0, 'a'), (2, 5.0, 'c'), (NULL, 4.0, 'b') df(a, b, c)" + ... ) + >>> df = nw.from_native(df_native) + >>> df.sort("a") + ┌──────────────────────────────────┐ + | Narwhals LazyFrame | + |----------------------------------| + |┌───────┬──────────────┬─────────┐| + |│ a │ b │ c │| + |│ int32 │ decimal(2,1) │ varchar │| + |├───────┼──────────────┼─────────┤| + |│ NULL │ 4.0 │ b │| + |│ 1 │ 6.0 │ a │| + |│ 2 │ 5.0 │ c │| + |└───────┴──────────────┴─────────┘| + └──────────────────────────────────┘ + """ + return super().sort(by, *more_by, descending=descending, nulls_last=nulls_last) + + def join( + self, + other: Self, + on: str | list[str] | None = None, + how: JoinStrategy = "inner", + *, + left_on: str | list[str] | None = None, + right_on: str | list[str] | None = None, + suffix: str = "_right", + ) -> Self: + r"""Add a join operation to the Logical Plan. + + Arguments: + other: Lazy DataFrame to join with. + on: Name(s) of the join columns in both DataFrames. If set, `left_on` and + `right_on` should be None. + how: Join strategy. + + * *inner*: Returns rows that have matching values in both tables. + * *left*: Returns all rows from the left table, and the matched rows from the right table. 
+ * *full*: Returns all rows in both dataframes, with the suffix appended to the right join keys. + * *cross*: Returns the Cartesian product of rows from both tables. + * *semi*: Filter rows that have a match in the right table. + * *anti*: Filter rows that do not have a match in the right table. + left_on: Join column of the left DataFrame. + right_on: Join column of the right DataFrame. + suffix: Suffix to append to columns with a duplicate name. + + Returns: + A new joined LazyFrame. + + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> df_native1 = duckdb.sql( + ... "SELECT * FROM VALUES (1, 'a'), (2, 'b') df(a, b)" + ... ) + >>> df_native2 = duckdb.sql( + ... "SELECT * FROM VALUES (1, 'x'), (3, 'y') df(a, c)" + ... ) + >>> df1 = nw.from_native(df_native1) + >>> df2 = nw.from_native(df_native2) + >>> df1.join(df2, on="a") + ┌─────────────────────────────┐ + | Narwhals LazyFrame | + |-----------------------------| + |┌───────┬─────────┬─────────┐| + |│ a │ b │ c │| + |│ int32 │ varchar │ varchar │| + |├───────┼─────────┼─────────┤| + |│ 1 │ a │ x │| + |└───────┴─────────┴─────────┘| + └─────────────────────────────┘ + """ + return super().join( + other, how=how, left_on=left_on, right_on=right_on, on=on, suffix=suffix + ) + + def join_asof( + self, + other: Self, + *, + left_on: str | None = None, + right_on: str | None = None, + on: str | None = None, + by_left: str | list[str] | None = None, + by_right: str | list[str] | None = None, + by: str | list[str] | None = None, + strategy: AsofJoinStrategy = "backward", + suffix: str = "_right", + ) -> Self: + """Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than equal keys. + + For Polars, both DataFrames must be sorted by the `on` key (within each `by` group + if specified). + + Arguments: + other: DataFrame to join with. + left_on: Name(s) of the left join column(s). + right_on: Name(s) of the right join column(s). 
+ on: Join column of both DataFrames. If set, left_on and right_on should be None. + by_left: join on these columns before doing asof join + by_right: join on these columns before doing asof join + by: join on these columns before doing asof join + strategy: Join strategy. The default is "backward". + + * *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key. + * *forward*: selects the first row in the right DataFrame whose "on" key is greater than or equal to the left's key. + * *nearest*: search selects the last row in the right DataFrame whose value is nearest to the left's key. + + suffix: Suffix to append to columns with a duplicate name. + + Returns: + A new joined LazyFrame. + + Examples: + >>> from datetime import datetime + >>> import polars as pl + >>> import narwhals as nw + >>> data_gdp = { + ... "datetime": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... datetime(2020, 1, 1), + ... ], + ... "gdp": [4164, 4411, 4566, 4696, 4827], + ... } + >>> data_population = { + ... "datetime": [ + ... datetime(2016, 3, 1), + ... datetime(2018, 8, 1), + ... datetime(2019, 1, 1), + ... ], + ... "population": [82.19, 82.66, 83.12], + ... 
} + >>> gdp_native = pl.DataFrame(data_gdp) + >>> population_native = pl.DataFrame(data_population) + >>> gdp = nw.from_native(gdp_native) + >>> population = nw.from_native(population_native) + >>> population.join_asof(gdp, on="datetime", strategy="backward").to_native() + shape: (3, 3) + ┌─────────────────────┬────────────┬──────┐ + │ datetime ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-03-01 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2018-08-01 00:00:00 ┆ 82.66 ┆ 4566 │ + │ 2019-01-01 00:00:00 ┆ 83.12 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + """ + return super().join_asof( + other, + left_on=left_on, + right_on=right_on, + on=on, + by_left=by_left, + by_right=by_right, + by=by, + strategy=strategy, + suffix=suffix, + ) + + def lazy(self) -> Self: + """Restrict available API methods to lazy-only ones. + + This is a no-op, and exists only for compatibility with `DataFrame.lazy`. + + Returns: + A LazyFrame. + """ + return self + + def gather_every(self, n: int, offset: int = 0) -> Self: + r"""Take every nth row in the DataFrame and return as a new DataFrame. + + Warning: + `LazyFrame.gather_every` is deprecated and will be removed in a future version. + Note: this will remain available in `narwhals.stable.v1`. + See [stable api](../backcompat.md/) for more information. + + Arguments: + n: Gather every *n*-th row. + offset: Starting index. + + Returns: + The LazyFrame containing only the selected rows. 
+ """ + msg = ( + "`LazyFrame.gather_every` is deprecated and will be removed in a future version.\n\n" + "Note: this will remain available in `narwhals.stable.v1`.\n" + "See https://narwhals-dev.github.io/narwhals/backcompat/ for more information.\n" + ) + issue_deprecation_warning(msg, _version="1.29.0") + + return super().gather_every(n=n, offset=offset) + + def unpivot( + self, + on: str | list[str] | None = None, + *, + index: str | list[str] | None = None, + variable_name: str = "variable", + value_name: str = "value", + ) -> Self: + r"""Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (index) while all other columns, considered + measured variables (on), are "unpivoted" to the row axis leaving just + two non-identifier columns, 'variable' and 'value'. + + Arguments: + on: Column(s) to use as values variables; if `on` is empty all columns that + are not in `index` will be used. + index: Column(s) to use as identifier variables. + variable_name: Name to give to the `variable` column. Defaults to "variable". + value_name: Name to give to the `value` column. Defaults to "value". + + Returns: + The unpivoted LazyFrame. + + Notes: + If you're coming from pandas, this is similar to `pandas.DataFrame.melt`, + but with `index` replacing `id_vars` and `on` replacing `value_vars`. + In other frameworks, you might know this operation as `pivot_longer`. + + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> df_native = duckdb.sql( + ... "SELECT * FROM VALUES ('x', 1, 2), ('y', 3, 4), ('z', 5, 6) df(a, b, c)" + ... 
) + >>> df = nw.from_native(df_native) + >>> df.unpivot(on=["b", "c"], index="a").sort("a", "variable").to_native() + ┌─────────┬──────────┬───────┐ + │ a │ variable │ value │ + │ varchar │ varchar │ int32 │ + ├─────────┼──────────┼───────┤ + │ x │ b │ 1 │ + │ x │ c │ 2 │ + │ y │ b │ 3 │ + │ y │ c │ 4 │ + │ z │ b │ 5 │ + │ z │ c │ 6 │ + └─────────┴──────────┴───────┘ + + """ + return super().unpivot( + on=on, index=index, variable_name=variable_name, value_name=value_name + ) + + def explode(self, columns: str | Sequence[str], *more_columns: str) -> Self: + """Explode the dataframe to long format by exploding the given columns. + + Notes: + It is possible to explode multiple columns only if these columns have + matching element counts. + + Arguments: + columns: Column names. The underlying columns being exploded must be of the `List` data type. + *more_columns: Additional names of columns to explode, specified as positional arguments. + + Returns: + New LazyFrame + + Examples: + >>> import duckdb + >>> import narwhals as nw + >>> df_native = duckdb.sql( + ... "SELECT * FROM VALUES ('x', [1, 2]), ('y', [3, 4]), ('z', [5, 6]) df(a, b)" + ... ) + >>> df = nw.from_native(df_native) + >>> df.explode("b").to_native() + ┌─────────┬───────┐ + │ a │ b │ + │ varchar │ int32 │ + ├─────────┼───────┤ + │ x │ 1 │ + │ x │ 2 │ + │ y │ 3 │ + │ y │ 4 │ + │ z │ 5 │ + │ z │ 6 │ + └─────────┴───────┘ + + """ + return super().explode(columns, *more_columns) -- cgit v1.2.3-70-g09d2