aboutsummaryrefslogtreecommitdiff
path: root/venv/lib/python3.8/site-packages/plotly/data
diff options
context:
space:
mode:
authorsotech117 <michael_foiani@brown.edu>2025-07-31 17:27:24 -0400
committersotech117 <michael_foiani@brown.edu>2025-07-31 17:27:24 -0400
commit5bf22fc7e3c392c8bd44315ca2d06d7dca7d084e (patch)
tree8dacb0f195df1c0788d36dd0064f6bbaa3143ede /venv/lib/python3.8/site-packages/plotly/data
parentb832d364da8c2efe09e3f75828caf73c50d01ce3 (diff)
add code for analysis of data
Diffstat (limited to 'venv/lib/python3.8/site-packages/plotly/data')
-rw-r--r--venv/lib/python3.8/site-packages/plotly/data/__init__.py430
1 files changed, 430 insertions, 0 deletions
diff --git a/venv/lib/python3.8/site-packages/plotly/data/__init__.py b/venv/lib/python3.8/site-packages/plotly/data/__init__.py
new file mode 100644
index 0000000..3bba389
--- /dev/null
+++ b/venv/lib/python3.8/site-packages/plotly/data/__init__.py
@@ -0,0 +1,430 @@
+"""
+Built-in datasets for demonstration, educational and test purposes.
+"""
+
+import os
+from importlib import import_module
+
+import narwhals.stable.v1 as nw
+
+AVAILABLE_BACKENDS = {"pandas", "polars", "pyarrow", "modin", "cudf"}
+BACKENDS_WITH_INDEX_SUPPORT = {"pandas", "modin", "cudf"}
+
+
+def gapminder(
+ datetimes=False,
+ centroids=False,
+ year=None,
+ pretty_names=False,
+ return_type="pandas",
+):
+ """
+ Each row represents a country on a given year.
+
+ https://www.gapminder.org/data/
+
+ Parameters
+ ----------
+ datetimes: bool
+ Whether or not 'year' column will converted to datetime type
+
+ centroids: bool
+ If True, ['centroid_lat', 'centroid_lon'] columns are added
+
+ year: int | None
+ If provided, the dataset will be filtered for that year
+
+ pretty_names: bool
+ If True, prettifies the column names
+
+ return_type: {'pandas', 'polars', 'pyarrow', 'modin', 'cudf'}
+ Type of the resulting dataframe
+
+ Returns
+ -------
+ Dataframe of `return_type` type
+ Dataframe with 1704 rows and the following columns:
+ `['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap',
+ 'iso_alpha', 'iso_num']`.
+
+ If `datetimes` is True, the 'year' column will be a datetime column
+ If `centroids` is True, two new columns are added: ['centroid_lat', 'centroid_lon']
+ If `year` is an integer, the dataset will be filtered for that year
+ """
+ df = nw.from_native(
+ _get_dataset("gapminder", return_type=return_type), eager_only=True
+ )
+ if year:
+ df = df.filter(nw.col("year") == year)
+ if datetimes:
+ df = df.with_columns(
+ # Concatenate the year value with the literal "-01-01" so that it can be
+ # casted to datetime from "%Y-%m-%d" format
+ nw.concat_str(
+ [nw.col("year").cast(nw.String()), nw.lit("-01-01")]
+ ).str.to_datetime(format="%Y-%m-%d")
+ )
+ if not centroids:
+ df = df.drop("centroid_lat", "centroid_lon")
+ if pretty_names:
+ df = df.rename(
+ dict(
+ country="Country",
+ continent="Continent",
+ year="Year",
+ lifeExp="Life Expectancy",
+ gdpPercap="GDP per Capita",
+ pop="Population",
+ iso_alpha="ISO Alpha Country Code",
+ iso_num="ISO Numeric Country Code",
+ centroid_lat="Centroid Latitude",
+ centroid_lon="Centroid Longitude",
+ )
+ )
+ return df.to_native()
+
+
+def tips(pretty_names=False, return_type="pandas"):
+ """
+ Each row represents a restaurant bill.
+
+ https://vincentarelbundock.github.io/Rdatasets/doc/reshape2/tips.html
+
+ Parameters
+ ----------
+ pretty_names: bool
+ If True, prettifies the column names
+
+ return_type: {'pandas', 'polars', 'pyarrow', 'modin', 'cudf'}
+ Type of the resulting dataframe
+
+ Returns
+ -------
+ Dataframe of `return_type` type
+ Dataframe with 244 rows and the following columns:
+ `['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']`.
+ """
+
+ df = nw.from_native(_get_dataset("tips", return_type=return_type), eager_only=True)
+ if pretty_names:
+ df = df.rename(
+ dict(
+ total_bill="Total Bill",
+ tip="Tip",
+ sex="Payer Gender",
+ smoker="Smokers at Table",
+ day="Day of Week",
+ time="Meal",
+ size="Party Size",
+ )
+ )
+ return df.to_native()
+
+
+def iris(return_type="pandas"):
+ """
+ Each row represents a flower.
+
+ https://en.wikipedia.org/wiki/Iris_flower_data_set
+
+ Parameters
+ ----------
+ return_type: {'pandas', 'polars', 'pyarrow', 'modin', 'cudf'}
+ Type of the resulting dataframe
+
+ Returns
+ -------
+ Dataframe of `return_type` type
+ Dataframe with 150 rows and the following columns:
+ `['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species', 'species_id']`.
+ """
+ return _get_dataset("iris", return_type=return_type)
+
+
+def wind(return_type="pandas"):
+ """
+ Each row represents a level of wind intensity in a cardinal direction, and its frequency.
+
+ Parameters
+ ----------
+ return_type: {'pandas', 'polars', 'pyarrow', 'modin', 'cudf'}
+ Type of the resulting dataframe
+
+ Returns
+ -------
+ Dataframe of `return_type` type
+ Dataframe with 128 rows and the following columns:
+ `['direction', 'strength', 'frequency']`.
+ """
+ return _get_dataset("wind", return_type=return_type)
+
+
+def election(return_type="pandas"):
+ """
+ Each row represents voting results for an electoral district in the 2013 Montreal
+ mayoral election.
+
+ Parameters
+ ----------
+ return_type: {'pandas', 'polars', 'pyarrow', 'modin', 'cudf'}
+ Type of the resulting dataframe
+
+ Returns
+ -------
+ Dataframe of `return_type` type
+ Dataframe with 58 rows and the following columns:
+ `['district', 'Coderre', 'Bergeron', 'Joly', 'total', 'winner', 'result', 'district_id']`.
+ """
+ return _get_dataset("election", return_type=return_type)
+
+
+def election_geojson():
+ """
+ Each feature represents an electoral district in the 2013 Montreal mayoral election.
+
+ Returns
+ -------
+ A GeoJSON-formatted `dict` with 58 polygon or multi-polygon features whose `id`
+ is an electoral district numerical ID and whose `district` property is the ID and
+ district name.
+ """
+ import gzip
+ import json
+ import os
+
+ path = os.path.join(
+ os.path.dirname(os.path.dirname(__file__)),
+ "package_data",
+ "datasets",
+ "election.geojson.gz",
+ )
+ with gzip.GzipFile(path, "r") as f:
+ result = json.loads(f.read().decode("utf-8"))
+ return result
+
+
+def carshare(return_type="pandas"):
+ """
+ Each row represents the availability of car-sharing services near the centroid of a zone
+ in Montreal over a month-long period.
+
+ Parameters
+ ----------
+ return_type: {'pandas', 'polars', 'pyarrow', 'modin', 'cudf'}
+ Type of the resulting dataframe
+
+ Returns
+ -------
+ Dataframe of `return_type` type
+ Dataframe` with 249 rows and the following columns:
+ `['centroid_lat', 'centroid_lon', 'car_hours', 'peak_hour']`.
+ """
+ return _get_dataset("carshare", return_type=return_type)
+
+
+def stocks(indexed=False, datetimes=False, return_type="pandas"):
+ """
+ Each row in this wide dataset represents closing prices from 6 tech stocks in 2018/2019.
+
+ Parameters
+ ----------
+ indexed: bool
+ Whether or not the 'date' column is used as the index and the column index
+ is named 'company'. Applicable only if `return_type='pandas'`
+
+ datetimes: bool
+ Whether or not the 'date' column will be of datetime type
+
+ return_type: {'pandas', 'polars', 'pyarrow', 'modin', 'cudf'}
+ Type of the resulting dataframe
+
+ Returns
+ -------
+ Dataframe of `return_type` type
+ Dataframe with 100 rows and the following columns:
+ `['date', 'GOOG', 'AAPL', 'AMZN', 'FB', 'NFLX', 'MSFT']`.
+ If `indexed` is True, the 'date' column is used as the index and the column index
+ is named 'company'
+ If `datetimes` is True, the 'date' column will be a datetime column
+ """
+ if indexed and return_type not in BACKENDS_WITH_INDEX_SUPPORT:
+ msg = f"Backend '{return_type}' does not support setting index"
+ raise NotImplementedError(msg)
+
+ df = nw.from_native(
+ _get_dataset("stocks", return_type=return_type), eager_only=True
+ ).with_columns(nw.col("date").cast(nw.String()))
+
+ if datetimes:
+ df = df.with_columns(nw.col("date").str.to_datetime())
+
+ if indexed: # then it must be pandas
+ df = df.to_native().set_index("date")
+ df.columns.name = "company"
+ return df
+
+ return df.to_native()
+
+
+def experiment(indexed=False, return_type="pandas"):
+ """
+ Each row in this wide dataset represents the results of 100 simulated participants
+ on three hypothetical experiments, along with their gender and control/treatment group.
+
+ Parameters
+ ----------
+ indexed: bool
+ If True, then the index is named "participant".
+ Applicable only if `return_type='pandas'`
+
+ return_type: {'pandas', 'polars', 'pyarrow', 'modin', 'cudf'}
+ Type of the resulting dataframe
+
+ Returns
+ -------
+ Dataframe of `return_type` type
+ Dataframe with 100 rows and the following columns:
+ `['experiment_1', 'experiment_2', 'experiment_3', 'gender', 'group']`.
+ If `indexed` is True, the data frame index is named "participant"
+ """
+
+ if indexed and return_type not in BACKENDS_WITH_INDEX_SUPPORT:
+ msg = f"Backend '{return_type}' does not support setting index"
+ raise NotImplementedError(msg)
+
+ df = nw.from_native(
+ _get_dataset("experiment", return_type=return_type), eager_only=True
+ )
+ if indexed: # then it must be pandas
+ df = df.to_native()
+ df.index.name = "participant"
+ return df
+ return df.to_native()
+
+
+def medals_wide(indexed=False, return_type="pandas"):
+ """
+ This dataset represents the medal table for Olympic Short Track Speed Skating for the
+ top three nations as of 2020.
+
+ Parameters
+ ----------
+ indexed: bool
+ Whether or not the 'nation' column is used as the index and the column index
+ is named 'medal'. Applicable only if `return_type='pandas'`
+
+ return_type: {'pandas', 'polars', 'pyarrow', 'modin', 'cudf'}
+ Type of the resulting dataframe
+
+ Returns
+ -------
+ Dataframe of `return_type` type
+ Dataframe with 3 rows and the following columns:
+ `['nation', 'gold', 'silver', 'bronze']`.
+ If `indexed` is True, the 'nation' column is used as the index and the column index
+ is named 'medal'
+ """
+
+ if indexed and return_type not in BACKENDS_WITH_INDEX_SUPPORT:
+ msg = f"Backend '{return_type}' does not support setting index"
+ raise NotImplementedError(msg)
+
+ df = nw.from_native(
+ _get_dataset("medals", return_type=return_type), eager_only=True
+ )
+ if indexed: # then it must be pandas
+ df = df.to_native().set_index("nation")
+ df.columns.name = "medal"
+ return df
+ return df.to_native()
+
+
+def medals_long(indexed=False, return_type="pandas"):
+ """
+ This dataset represents the medal table for Olympic Short Track Speed Skating for the
+ top three nations as of 2020.
+
+ Parameters
+ ----------
+ indexed: bool
+ Whether or not the 'nation' column is used as the index.
+ Applicable only if `return_type='pandas'`
+
+ return_type: {'pandas', 'polars', 'pyarrow', 'modin', 'cudf'}
+ Type of the resulting dataframe
+
+ Returns
+ -------
+ Dataframe of `return_type` type
+ Dataframe with 9 rows and the following columns: `['nation', 'medal', 'count']`.
+ If `indexed` is True, the 'nation' column is used as the index.
+ """
+
+ if indexed and return_type not in BACKENDS_WITH_INDEX_SUPPORT:
+ msg = f"Backend '{return_type}' does not support setting index"
+ raise NotImplementedError(msg)
+
+ df = nw.from_native(
+ _get_dataset("medals", return_type=return_type), eager_only=True
+ ).unpivot(
+ index=["nation"],
+ value_name="count",
+ variable_name="medal",
+ )
+ if indexed:
+ df = nw.maybe_set_index(df, "nation")
+ return df.to_native()
+
+
+def _get_dataset(d, return_type):
+ """
+ Loads the dataset using the specified backend.
+
+ Notice that the available backends are 'pandas', 'polars', 'pyarrow' and they all have
+ a `read_csv` function (pyarrow has it via pyarrow.csv). Therefore we can dynamically
+ load the library using `importlib.import_module` and then call
+ `backend.read_csv(filepath)`.
+
+ Parameters
+ ----------
+ d: str
+ Name of the dataset to load.
+
+ return_type: {'pandas', 'polars', 'pyarrow', 'modin', 'cudf'}
+ Type of the resulting dataframe
+
+ Returns
+ -------
+ Dataframe of `return_type` type
+ """
+ filepath = os.path.join(
+ os.path.dirname(os.path.dirname(__file__)),
+ "package_data",
+ "datasets",
+ d + ".csv.gz",
+ )
+
+ if return_type not in AVAILABLE_BACKENDS:
+ msg = (
+ f"Unsupported return_type. Found {return_type}, expected one "
+ f"of {AVAILABLE_BACKENDS}"
+ )
+ raise NotImplementedError(msg)
+
+ try:
+ if return_type == "pyarrow":
+ module_to_load = "pyarrow.csv"
+ elif return_type == "modin":
+ module_to_load = "modin.pandas"
+ else:
+ module_to_load = return_type
+ backend = import_module(module_to_load)
+ except ModuleNotFoundError:
+ msg = f"return_type={return_type}, but {return_type} is not installed"
+ raise ModuleNotFoundError(msg)
+
+ try:
+ return backend.read_csv(filepath)
+ except Exception as e:
+ msg = f"Unable to read '{d}' dataset due to: {e}"
+ raise Exception(msg).with_traceback(e.__traceback__)