diff options
Diffstat (limited to 'venv/lib/python3.8/site-packages/narwhals/expr_str.py')
-rw-r--r-- | venv/lib/python3.8/site-packages/narwhals/expr_str.py | 449 |
1 files changed, 449 insertions, 0 deletions
diff --git a/venv/lib/python3.8/site-packages/narwhals/expr_str.py b/venv/lib/python3.8/site-packages/narwhals/expr_str.py new file mode 100644 index 0000000..e598ff7 --- /dev/null +++ b/venv/lib/python3.8/site-packages/narwhals/expr_str.py @@ -0,0 +1,449 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Generic, TypeVar + +if TYPE_CHECKING: + from narwhals.expr import Expr + +ExprT = TypeVar("ExprT", bound="Expr") + + +class ExprStringNamespace(Generic[ExprT]): + def __init__(self, expr: ExprT) -> None: + self._expr = expr + + def len_chars(self) -> ExprT: + r"""Return the length of each string as the number of characters. + + Returns: + A new expression. + + Examples: + >>> import polars as pl + >>> import narwhals as nw + >>> df_native = pl.DataFrame({"words": ["foo", "345", None]}) + >>> df = nw.from_native(df_native) + >>> df.with_columns(words_len=nw.col("words").str.len_chars()) + ┌─────────────────────┐ + | Narwhals DataFrame | + |---------------------| + |shape: (3, 2) | + |┌───────┬───────────┐| + |│ words ┆ words_len │| + |│ --- ┆ --- │| + |│ str ┆ u32 │| + |╞═══════╪═══════════╡| + |│ foo ┆ 3 │| + |│ 345 ┆ 3 │| + |│ null ┆ null │| + |└───────┴───────────┘| + └─────────────────────┘ + """ + return self._expr._with_elementwise_op( + lambda plx: self._expr._to_compliant_expr(plx).str.len_chars() + ) + + def replace( + self, pattern: str, value: str, *, literal: bool = False, n: int = 1 + ) -> ExprT: + r"""Replace first matching regex/literal substring with a new string value. + + Arguments: + pattern: A valid regular expression pattern. + value: String that will replace the matched substring. + literal: Treat `pattern` as a literal string. + n: Number of matches to replace. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import narwhals as nw + >>> df_native = pd.DataFrame({"foo": ["123abc", "abc abc123"]}) + >>> df = nw.from_native(df_native) + >>> df.with_columns(replaced=nw.col("foo").str.replace("abc", "")) + ┌──────────────────────┐ + | Narwhals DataFrame | + |----------------------| + | foo replaced| + |0 123abc 123| + |1 abc abc123 abc123| + └──────────────────────┘ + """ + return self._expr._with_elementwise_op( + lambda plx: self._expr._to_compliant_expr(plx).str.replace( + pattern, value, literal=literal, n=n + ) + ) + + def replace_all(self, pattern: str, value: str, *, literal: bool = False) -> ExprT: + r"""Replace all matching regex/literal substring with a new string value. + + Arguments: + pattern: A valid regular expression pattern. + value: String that will replace the matched substring. + literal: Treat `pattern` as a literal string. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import narwhals as nw + >>> df_native = pd.DataFrame({"foo": ["123abc", "abc abc123"]}) + >>> df = nw.from_native(df_native) + >>> df.with_columns(replaced=nw.col("foo").str.replace_all("abc", "")) + ┌──────────────────────┐ + | Narwhals DataFrame | + |----------------------| + | foo replaced| + |0 123abc 123| + |1 abc abc123 123| + └──────────────────────┘ + """ + return self._expr._with_elementwise_op( + lambda plx: self._expr._to_compliant_expr(plx).str.replace_all( + pattern, value, literal=literal + ) + ) + + def strip_chars(self, characters: str | None = None) -> ExprT: + r"""Remove leading and trailing characters. + + Arguments: + characters: The set of characters to be removed. All combinations of this + set of characters will be stripped from the start and end of the string. + If set to None (default), all leading and trailing whitespace is removed + instead. + + Returns: + A new expression. + + Examples: + >>> import polars as pl + >>> import narwhals as nw + >>> df_native = pl.DataFrame({"fruits": ["apple", "\nmango"]}) + >>> df = nw.from_native(df_native) + >>> df.with_columns(stripped=nw.col("fruits").str.strip_chars()).to_dict( + ... as_series=False + ... ) + {'fruits': ['apple', '\nmango'], 'stripped': ['apple', 'mango']} + """ + return self._expr._with_elementwise_op( + lambda plx: self._expr._to_compliant_expr(plx).str.strip_chars(characters) + ) + + def starts_with(self, prefix: str) -> ExprT: + r"""Check if string values start with a substring. + + Arguments: + prefix: prefix substring + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import narwhals as nw + >>> df_native = pd.DataFrame({"fruits": ["apple", "mango", None]}) + >>> df = nw.from_native(df_native) + >>> df.with_columns(has_prefix=nw.col("fruits").str.starts_with("app")) + ┌───────────────────┐ + |Narwhals DataFrame | + |-------------------| + | fruits has_prefix| + |0 apple True| + |1 mango False| + |2 None None| + └───────────────────┘ + """ + return self._expr._with_elementwise_op( + lambda plx: self._expr._to_compliant_expr(plx).str.starts_with(prefix) + ) + + def ends_with(self, suffix: str) -> ExprT: + r"""Check if string values end with a substring. + + Arguments: + suffix: suffix substring + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import narwhals as nw + >>> df_native = pd.DataFrame({"fruits": ["apple", "mango", None]}) + >>> df = nw.from_native(df_native) + >>> df.with_columns(has_suffix=nw.col("fruits").str.ends_with("ngo")) + ┌───────────────────┐ + |Narwhals DataFrame | + |-------------------| + | fruits has_suffix| + |0 apple False| + |1 mango True| + |2 None None| + └───────────────────┘ + """ + return self._expr._with_elementwise_op( + lambda plx: self._expr._to_compliant_expr(plx).str.ends_with(suffix) + ) + + def contains(self, pattern: str, *, literal: bool = False) -> ExprT: + r"""Check if string contains a substring that matches a pattern. + + Arguments: + pattern: A Character sequence or valid regular expression pattern. + literal: If True, treats the pattern as a literal string. + If False, assumes the pattern is a regular expression. + + Returns: + A new expression. + + Examples: + >>> import pyarrow as pa + >>> import narwhals as nw + >>> df_native = pa.table({"pets": ["cat", "dog", "rabbit and parrot"]}) + >>> df = nw.from_native(df_native) + >>> df.with_columns( + ... default_match=nw.col("pets").str.contains("cat|parrot"), + ... case_insensitive_match=nw.col("pets").str.contains("cat|(?i)parrot"), + ... ).to_native() + pyarrow.Table + pets: string + default_match: bool + case_insensitive_match: bool + ---- + pets: [["cat","dog","rabbit and parrot"]] + default_match: [[true,false,true]] + case_insensitive_match: [[true,false,true]] + """ + return self._expr._with_elementwise_op( + lambda plx: self._expr._to_compliant_expr(plx).str.contains( + pattern, literal=literal + ) + ) + + def slice(self, offset: int, length: int | None = None) -> ExprT: + r"""Create subslices of the string values of an expression. + + Arguments: + offset: Start index. Negative indexing is supported. + length: Length of the slice. If set to `None` (default), the slice is taken to the + end of the string. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import narwhals as nw + >>> df_native = pd.DataFrame({"s": ["pear", None, "papaya"]}) + >>> df = nw.from_native(df_native) + >>> df.with_columns(s_sliced=nw.col("s").str.slice(4, length=3)) + ┌──────────────────┐ + |Narwhals DataFrame| + |------------------| + | s s_sliced| + |0 pear | + |1 None None| + |2 papaya ya| + └──────────────────┘ + """ + return self._expr._with_elementwise_op( + lambda plx: self._expr._to_compliant_expr(plx).str.slice( + offset=offset, length=length + ) + ) + + def split(self, by: str) -> ExprT: + r"""Split the string values of an expression by a substring. + + Arguments: + by: Substring to split by. + + Returns: + A new expression. + + Examples: + >>> import polars as pl + >>> import narwhals as nw + >>> df_native = pl.DataFrame({"s": ["foo bar", "foo_bar"]}) + >>> df = nw.from_native(df_native) + >>> df.with_columns(nw.col("s").str.split("_").alias("s_split")) + ┌────────────────────────────┐ + | Narwhals DataFrame | + |----------------------------| + |shape: (2, 2) | + |┌─────────┬────────────────┐| + |│ s ┆ s_split │| + |│ --- ┆ --- │| + |│ str ┆ list[str] │| + |╞═════════╪════════════════╡| + |│ foo bar ┆ ["foo bar"] │| + |│ foo_bar ┆ ["foo", "bar"] │| + |└─────────┴────────────────┘| + └────────────────────────────┘ + """ + return self._expr._with_elementwise_op( + lambda plx: self._expr._to_compliant_expr(plx).str.split(by=by) + ) + + def head(self, n: int = 5) -> ExprT: + r"""Take the first n elements of each string. + + Arguments: + n: Number of elements to take. Negative indexing is **not** supported. + + Returns: + A new expression. + + Notes: + If the length of the string has fewer than `n` characters, the full string is returned. + + Examples: + >>> import pyarrow as pa + >>> import narwhals as nw + >>> df_native = pa.table({"lyrics": ["taata", "taatatata", "zukkyun"]}) + >>> df = nw.from_native(df_native) + >>> df.with_columns(lyrics_head=nw.col("lyrics").str.head()).to_native() + pyarrow.Table + lyrics: string + lyrics_head: string + ---- + lyrics: [["taata","taatatata","zukkyun"]] + lyrics_head: [["taata","taata","zukky"]] + """ + return self._expr._with_elementwise_op( + lambda plx: self._expr._to_compliant_expr(plx).str.slice(0, n) + ) + + def tail(self, n: int = 5) -> ExprT: + r"""Take the last n elements of each string. + + Arguments: + n: Number of elements to take. Negative indexing is **not** supported. + + Returns: + A new expression. + + Notes: + If the length of the string has fewer than `n` characters, the full string is returned. + + Examples: + >>> import pyarrow as pa + >>> import narwhals as nw + >>> df_native = pa.table({"lyrics": ["taata", "taatatata", "zukkyun"]}) + >>> df = nw.from_native(df_native) + >>> df.with_columns(lyrics_tail=nw.col("lyrics").str.tail()).to_native() + pyarrow.Table + lyrics: string + lyrics_tail: string + ---- + lyrics: [["taata","taatatata","zukkyun"]] + lyrics_tail: [["taata","atata","kkyun"]] + """ + return self._expr._with_elementwise_op( + lambda plx: self._expr._to_compliant_expr(plx).str.slice( + offset=-n, length=None + ) + ) + + def to_datetime(self, format: str | None = None) -> ExprT: + """Convert to Datetime dtype. + + Notes: + - pandas defaults to nanosecond time unit, Polars to microsecond. + Prior to pandas 2.0, nanoseconds were the only time unit supported + in pandas, with no ability to set any other one. The ability to + set the time unit in pandas, if the version permits, will arrive. + - timezone-aware strings are all converted to and parsed as UTC. + + Warning: + As different backends auto-infer format in different ways, if `format=None` + there is no guarantee that the result will be equal. + + Arguments: + format: Format to use for conversion. If set to None (default), the format is + inferred from the data. + + Returns: + A new expression. + + Examples: + >>> import polars as pl + >>> import narwhals as nw + >>> df_native = pl.DataFrame({"a": ["2020-01-01", "2020-01-02"]}) + >>> df = nw.from_native(df_native) + >>> df.select(nw.col("a").str.to_datetime(format="%Y-%m-%d")) + ┌───────────────────────┐ + | Narwhals DataFrame | + |-----------------------| + |shape: (2, 1) | + |┌─────────────────────┐| + |│ a │| + |│ --- │| + |│ datetime[μs] │| + |╞═════════════════════╡| + |│ 2020-01-01 00:00:00 │| + |│ 2020-01-02 00:00:00 │| + |└─────────────────────┘| + └───────────────────────┘ + """ + return self._expr._with_elementwise_op( + lambda plx: self._expr._to_compliant_expr(plx).str.to_datetime(format=format) + ) + + def to_uppercase(self) -> ExprT: + r"""Transform string to uppercase variant. + + Returns: + A new expression. + + Notes: + The PyArrow backend will convert 'ß' to 'ẞ' instead of 'SS'. + For more info see [the related issue](https://github.com/apache/arrow/issues/34599). + There may be other unicode-edge-case-related variations across implementations. + + Examples: + >>> import pandas as pd + >>> import narwhals as nw + >>> df_native = pd.DataFrame({"fruits": ["apple", None]}) + >>> df = nw.from_native(df_native) + >>> df.with_columns(upper_col=nw.col("fruits").str.to_uppercase()) + ┌──────────────────┐ + |Narwhals DataFrame| + |------------------| + | fruits upper_col| + |0 apple APPLE| + |1 None None| + └──────────────────┘ + """ + return self._expr._with_elementwise_op( + lambda plx: self._expr._to_compliant_expr(plx).str.to_uppercase() + ) + + def to_lowercase(self) -> ExprT: + r"""Transform string to lowercase variant. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import narwhals as nw + >>> df_native = pd.DataFrame({"fruits": ["APPLE", None]}) + >>> df = nw.from_native(df_native) + >>> df.with_columns(lower_col=nw.col("fruits").str.to_lowercase()) + ┌──────────────────┐ + |Narwhals DataFrame| + |------------------| + | fruits lower_col| + |0 APPLE apple| + |1 None None| + └──────────────────┘ + """ + return self._expr._with_elementwise_op( + lambda plx: self._expr._to_compliant_expr(plx).str.to_lowercase() + ) |