diff --git a/CHANGELOG.md b/CHANGELOG.md index 46b97c2210..f649f2f8a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,31 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.8.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.7.0...v2.8.0) (2025-06-23) + + +### ⚠ BREAKING CHANGES + +* add required param 'engine' to multimodal functions ([#1834](https://github.com/googleapis/python-bigquery-dataframes/issues/1834)) + +### Features + +* Add `bpd.options.compute.maximum_result_rows` option to limit client data download ([#1829](https://github.com/googleapis/python-bigquery-dataframes/issues/1829)) ([e22a3f6](https://github.com/googleapis/python-bigquery-dataframes/commit/e22a3f61a02cc1b7a5155556e5a07a1a2fea1d82)) +* Add `bpd.options.display.repr_mode = "anywidget"` to create an interactive display of the results ([#1820](https://github.com/googleapis/python-bigquery-dataframes/issues/1820)) ([be0a3cf](https://github.com/googleapis/python-bigquery-dataframes/commit/be0a3cf7711dadc68d8366ea90b99855773e2a2e)) +* Add DataFrame.ai.forecast() support ([#1828](https://github.com/googleapis/python-bigquery-dataframes/issues/1828)) ([7bc7f36](https://github.com/googleapis/python-bigquery-dataframes/commit/7bc7f36fc20d233f4cf5ed688cc5dcaf100ce4fb)) +* Add describe() method to Series ([#1827](https://github.com/googleapis/python-bigquery-dataframes/issues/1827)) ([a4205f8](https://github.com/googleapis/python-bigquery-dataframes/commit/a4205f882012820c034cb15d73b2768ec4ad3ac8)) +* Add required param 'engine' to multimodal functions ([#1834](https://github.com/googleapis/python-bigquery-dataframes/issues/1834)) ([37666e4](https://github.com/googleapis/python-bigquery-dataframes/commit/37666e4c137d52c28ab13477dfbcc6e92b913334)) + + +### Performance Improvements + +* Produce simpler sql ([#1836](https://github.com/googleapis/python-bigquery-dataframes/issues/1836)) 
([cf9c22a](https://github.com/googleapis/python-bigquery-dataframes/commit/cf9c22a09c4e668a598fa1dad0f6a07b59bc6524)) + + +### Documentation + +* Add ai.forecast notebook ([#1840](https://github.com/googleapis/python-bigquery-dataframes/issues/1840)) ([2430497](https://github.com/googleapis/python-bigquery-dataframes/commit/24304972fdbdfd12c25c7f4ef5a7b280f334801a)) + ## [2.7.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.6.0...v2.7.0) (2025-06-16) diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py index 89c0dc8d6a..97cd6e99af 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -55,29 +55,7 @@ class ComputeOptions: {'test2': 'abc', 'test3': False} Attributes: - maximum_bytes_billed (int, Options): - Limits the bytes billed for query jobs. Queries that will have - bytes billed beyond this limit will fail (without incurring a - charge). If unspecified, this will be set to your project default. - See `maximum_bytes_billed`: https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJobConfig#google_cloud_bigquery_job_QueryJobConfig_maximum_bytes_billed. - - enable_multi_query_execution (bool, Options): - If enabled, large queries may be factored into multiple smaller queries - in order to avoid generating queries that are too complex for the query - engine to handle. However this comes at the cost of increase cost and latency. - - extra_query_labels (Dict[str, Any], Options): - Stores additional custom labels for query configuration. - - semantic_ops_confirmation_threshold (int, optional): - .. deprecated:: 1.42.0 - Semantic operators are deprecated. Please use AI operators instead - - semantic_ops_threshold_autofail (bool): - .. deprecated:: 1.42.0 - Semantic operators are deprecated. 
Please use AI operators instead - - ai_ops_confirmation_threshold (int, optional): + ai_ops_confirmation_threshold (int | None): Guards against unexpected processing of large amount of rows by semantic operators. If the number of rows exceeds the threshold, the user will be asked to confirm their operations to resume. The default value is 0. Set the value to None @@ -87,26 +65,57 @@ class ComputeOptions: Guards against unexpected processing of large amount of rows by semantic operators. When set to True, the operation automatically fails without asking for user inputs. - allow_large_results (bool): + allow_large_results (bool | None): Specifies whether query results can exceed 10 GB. Defaults to False. Setting this to False (the default) restricts results to 10 GB for potentially faster execution; BigQuery will raise an error if this limit is exceeded. Setting to True removes this result size limit. + + enable_multi_query_execution (bool | None): + If enabled, large queries may be factored into multiple smaller queries + in order to avoid generating queries that are too complex for the query + engine to handle. However this comes at the cost of increase cost and latency. + + extra_query_labels (Dict[str, Any] | None): + Stores additional custom labels for query configuration. + + maximum_bytes_billed (int | None): + Limits the bytes billed for query jobs. Queries that will have + bytes billed beyond this limit will fail (without incurring a + charge). If unspecified, this will be set to your project default. + See `maximum_bytes_billed`: https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJobConfig#google_cloud_bigquery_job_QueryJobConfig_maximum_bytes_billed. + + maximum_result_rows (int | None): + Limits the number of rows in an execution result. 
When converting + a BigQuery DataFrames object to a pandas DataFrame or Series (e.g., + using ``.to_pandas()``, ``.peek()``, ``.__repr__()``, direct + iteration), the data is downloaded from BigQuery to the client + machine. This option restricts the number of rows that can be + downloaded. If the number of rows to be downloaded exceeds this + limit, a ``bigframes.exceptions.MaximumResultRowsExceeded`` + exception is raised. + + semantic_ops_confirmation_threshold (int | None): + .. deprecated:: 1.42.0 + Semantic operators are deprecated. Please use AI operators instead + + semantic_ops_threshold_autofail (bool): + .. deprecated:: 1.42.0 + Semantic operators are deprecated. Please use AI operators instead """ - maximum_bytes_billed: Optional[int] = None + ai_ops_confirmation_threshold: Optional[int] = 0 + ai_ops_threshold_autofail: bool = False + allow_large_results: Optional[bool] = None enable_multi_query_execution: bool = False extra_query_labels: Dict[str, Any] = dataclasses.field( default_factory=dict, init=False ) + maximum_bytes_billed: Optional[int] = None + maximum_result_rows: Optional[int] = None semantic_ops_confirmation_threshold: Optional[int] = 0 semantic_ops_threshold_autofail = False - ai_ops_confirmation_threshold: Optional[int] = 0 - ai_ops_threshold_autofail: bool = False - - allow_large_results: Optional[bool] = None - def assign_extra_query_labels(self, **kwargs: Any) -> None: """ Assigns additional custom labels for query configuration. 
The method updates the diff --git a/bigframes/_config/display_options.py b/bigframes/_config/display_options.py index dc8ab34f2a..430abc8ef0 100644 --- a/bigframes/_config/display_options.py +++ b/bigframes/_config/display_options.py @@ -29,7 +29,7 @@ class DisplayOptions: max_columns: int = 20 max_rows: int = 25 progress_bar: Optional[str] = "auto" - repr_mode: Literal["head", "deferred"] = "head" + repr_mode: Literal["head", "deferred", "anywidget"] = "head" max_info_columns: int = 100 max_info_rows: Optional[int] = 200000 diff --git a/bigframes/core/bigframe_node.py b/bigframes/core/bigframe_node.py index 45e3c40701..9054ab9ba0 100644 --- a/bigframes/core/bigframe_node.py +++ b/bigframes/core/bigframe_node.py @@ -20,9 +20,19 @@ import functools import itertools import typing -from typing import Callable, Dict, Generator, Iterable, Mapping, Sequence, Set, Tuple - -from bigframes.core import field, identifiers +from typing import ( + Callable, + Dict, + Generator, + Iterable, + Mapping, + Sequence, + Set, + Tuple, + Union, +) + +from bigframes.core import expression, field, identifiers import bigframes.core.schema as schemata import bigframes.dtypes @@ -278,6 +288,13 @@ def _dtype_lookup(self) -> dict[identifiers.ColumnId, bigframes.dtypes.Dtype]: def field_by_id(self) -> Mapping[identifiers.ColumnId, field.Field]: return {field.id: field for field in self.fields} + @property + def _node_expressions( + self, + ) -> Sequence[Union[expression.Expression, expression.Aggregation]]: + """List of scalar expressions. 
Intended for checking engine compatibility with used ops.""" + return () + # Plan algorithms def unique_nodes( self: BigFrameNode, diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 451783602d..0efbd47ae4 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -65,6 +65,7 @@ def compile_sql(request: configs.CompileRequest) -> configs.CompileResult: ordering: Optional[bf_ordering.RowOrdering] = result_node.order_by result_node = dataclasses.replace(result_node, order_by=None) result_node = cast(nodes.ResultNode, rewrites.column_pruning(result_node)) + result_node = cast(nodes.ResultNode, rewrites.defer_selection(result_node)) sql = compile_result_node(result_node) # Return the ordering iff no extra columns are needed to define the row order if ordering is not None: diff --git a/bigframes/core/compile/googlesql/query.py b/bigframes/core/compile/googlesql/query.py index e3b7a2c8ca..f591216b3a 100644 --- a/bigframes/core/compile/googlesql/query.py +++ b/bigframes/core/compile/googlesql/query.py @@ -83,7 +83,13 @@ def _select_field(self, field) -> SelectExpression: return SelectExpression(expression=expr.ColumnExpression(name=field)) else: - alias = field[1] if (field[0] != field[1]) else None + alias = ( + expr.AliasExpression(field[1]) + if isinstance(field[1], str) + else field[1] + if (field[0] != field[1]) + else None + ) return SelectExpression( expression=expr.ColumnExpression(name=field[0]), alias=alias ) @@ -119,7 +125,7 @@ def sql(self) -> str: return "\n".join(text) -@dataclasses.dataclass +@dataclasses.dataclass(frozen=True) class SelectExpression(abc.SQLSyntax): """This class represents `select_expression`.""" diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 84fd7124ba..3b7abd8463 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -87,6 +87,9 @@ def _compile_sql(self, 
request: configs.CompileRequest) -> configs.CompileResult nodes.ResultNode, rewrite.column_pruning(result_node) ) result_node = self._remap_variables(result_node) + result_node = typing.cast( + nodes.ResultNode, rewrite.defer_selection(result_node) + ) sql = self._compile_result_node(result_node) return configs.CompileResult( sql, result_node.schema.to_bigquery(), result_node.order_by @@ -97,6 +100,9 @@ def _compile_sql(self, request: configs.CompileRequest) -> configs.CompileResult result_node = typing.cast(nodes.ResultNode, rewrite.column_pruning(result_node)) result_node = self._remap_variables(result_node) + result_node = typing.cast( + nodes.ResultNode, rewrite.defer_selection(result_node) + ) sql = self._compile_result_node(result_node) # Return the ordering iff no extra columns are needed to define the row order if ordering is not None: @@ -125,10 +131,7 @@ def _compile_result_node(self, root: nodes.ResultNode) -> str: (name, scalar_compiler.compile_scalar_expression(ref)) for ref, name in root.output_cols ) - # Skip squashing selections to ensure the right ordering and limit keys - sqlglot_ir = self.compile_node(root.child).select( - selected_cols, squash_selections=False - ) + sqlglot_ir = self.compile_node(root.child).select(selected_cols) if root.order_by is not None: ordering_cols = tuple( diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 77ee0ccb78..47dab209d0 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -203,7 +203,6 @@ def from_union( def select( self, selected_cols: tuple[tuple[str, sge.Expression], ...], - squash_selections: bool = True, ) -> SQLGlotIR: selections = [ sge.Alias( @@ -213,15 +212,6 @@ def select( for id, expr in selected_cols ] - # If squashing is enabled, we try to simplify the selections - # by checking if the new selections are simply aliases of the - # original columns. 
- if squash_selections: - new_selections = _squash_selections(self.expr.expressions, selections) - if new_selections != []: - new_expr = self.expr.select(*new_selections, append=False) - return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) - new_expr = self._encapsulate_as_cte().select(*selections, append=False) return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) @@ -361,63 +351,3 @@ def _table(table: bigquery.TableReference) -> sge.Table: db=sg.to_identifier(table.dataset_id, quoted=True), catalog=sg.to_identifier(table.project, quoted=True), ) - - -def _squash_selections( - old_expr: list[sge.Expression], new_expr: list[sge.Alias] -) -> list[sge.Alias]: - """ - TODO: Reanble this function to optimize the SQL. - Simplifies the select column expressions if existing (old_expr) and - new (new_expr) selected columns are both simple aliases of column definitions. - - Example: - old_expr: [A AS X, B AS Y] - new_expr: [X AS P, Y AS Q] - Result: [A AS P, B AS Q] - """ - old_alias_map: typing.Dict[str, str] = {} - for selected in old_expr: - column_alias_pair = _get_column_alias_pair(selected) - if column_alias_pair is None: - return [] - else: - old_alias_map[column_alias_pair[1]] = column_alias_pair[0] - - new_selected_cols: typing.List[sge.Alias] = [] - for selected in new_expr: - column_alias_pair = _get_column_alias_pair(selected) - if column_alias_pair is None or column_alias_pair[0] not in old_alias_map: - return [] - else: - new_alias_expr = sge.Alias( - this=sge.ColumnDef( - this=sge.to_identifier( - old_alias_map[column_alias_pair[0]], quoted=True - ) - ), - alias=sg.to_identifier(column_alias_pair[1], quoted=True), - ) - new_selected_cols.append(new_alias_expr) - return new_selected_cols - - -def _get_column_alias_pair( - expr: sge.Expression, -) -> typing.Optional[typing.Tuple[str, str]]: - """Checks if an expression is a simple alias of a column definition - (e.g., "column_name AS alias_name"). 
- If it is, returns a tuple containing the alias name and original column name. - Returns `None` otherwise. - """ - if not isinstance(expr, sge.Alias): - return None - if not isinstance(expr.this, sge.ColumnDef): - return None - - column_def_expr: sge.ColumnDef = expr.this - if not isinstance(column_def_expr.this, sge.Identifier): - return None - - original_identifier: sge.Identifier = column_def_expr.this - return (original_identifier.this, expr.alias) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 836d84b46a..bc8b47d216 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -251,7 +251,9 @@ def __repr__(self) -> str: # metadata, like we do with DataFrame. opts = bigframes.options.display max_results = opts.max_rows - if opts.repr_mode == "deferred": + # anywidget mode uses the same display logic as the "deferred" mode + # for faster execution + if opts.repr_mode in ("deferred", "anywidget"): _, dry_run_query_job = self._block._compute_dry_run() return formatter.repr_query_job(dry_run_query_job) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 9dcd74182b..38becd29df 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -75,7 +75,7 @@ def additive_base(self) -> BigFrameNode: ... @abc.abstractmethod - def replace_additive_base(self, BigFrameNode): + def replace_additive_base(self, BigFrameNode) -> BigFrameNode: ...
@@ -274,6 +274,10 @@ def joins_nulls(self) -> bool: right_nullable = self.right_child.field_by_id[self.right_col.id].nullable return left_nullable or right_nullable + @property + def _node_expressions(self): + return (self.left_col, self.right_col) + def replace_additive_base(self, node: BigFrameNode): return dataclasses.replace(self, left_child=node) @@ -387,6 +391,10 @@ def referenced_ids(self) -> COLUMN_SET: def consumed_ids(self) -> COLUMN_SET: return frozenset(*self.ids, *self.referenced_ids) + @property + def _node_expressions(self): + return tuple(itertools.chain.from_iterable(self.conditions)) + def transform_children(self, t: Callable[[BigFrameNode], BigFrameNode]) -> JoinNode: transformed = dataclasses.replace( self, left_child=t(self.left_child), right_child=t(self.right_child) @@ -996,6 +1004,10 @@ def consumed_ids(self) -> COLUMN_SET: def referenced_ids(self) -> COLUMN_SET: return frozenset(self.predicate.column_references) + @property + def _node_expressions(self): + return (self.predicate,) + def remap_vars( self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> FilterNode: @@ -1050,6 +1062,10 @@ def referenced_ids(self) -> COLUMN_SET: itertools.chain.from_iterable(map(lambda x: x.referenced_columns, self.by)) ) + @property + def _node_expressions(self): + return tuple(map(lambda x: x.scalar_expression, self.by)) + def remap_vars( self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> OrderByNode: @@ -1178,6 +1194,10 @@ def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: def consumed_ids(self) -> COLUMN_SET: return frozenset(ref.id for ref, id in self.input_output_pairs) + @property + def _node_expressions(self): + return tuple(ref for ref, id in self.input_output_pairs) + def get_id_mapping(self) -> dict[identifiers.ColumnId, identifiers.ColumnId]: return {ref.id: id for ref, id in self.input_output_pairs} @@ -1265,6 +1285,10 @@ def referenced_ids(self) -> COLUMN_SET: ) ) + @property + def 
_node_expressions(self): + return tuple(ex for ex, id in self.assignments) + @property def additive_base(self) -> BigFrameNode: return self.child @@ -1361,6 +1385,13 @@ def has_ordered_ops(self) -> bool: aggregate.op.order_independent for aggregate, _ in self.aggregations ) + @property + def _node_expressions(self): + by_ids = (ref for ref in self.by_column_ids) + aggs = tuple(agg for agg, _ in self.aggregations) + order_ids = tuple(part.scalar_expression for part in self.order_by) + return (*by_ids, *aggs, *order_ids) + def remap_vars( self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> AggregateNode: @@ -1463,6 +1494,10 @@ def inherits_order(self) -> bool: def additive_base(self) -> BigFrameNode: return self.child + @property + def _node_expressions(self): + return (self.expression, *self.window_spec.expressions) + def replace_additive_base(self, node: BigFrameNode) -> WindowOpNode: return dataclasses.replace(self, child=node) @@ -1533,6 +1568,10 @@ class ExplodeNode(UnaryNode): # Offsets are generated only if this is non-null offsets_col: Optional[identifiers.ColumnId] = None + def _validate(self): + for col in self.column_ids: + assert col.id in self.child.ids + @property def row_preserving(self) -> bool: return False @@ -1584,6 +1623,10 @@ def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: def referenced_ids(self) -> COLUMN_SET: return frozenset(ref.id for ref in self.column_ids) + @property + def _node_expressions(self): + return self.column_ids + def remap_vars( self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> ExplodeNode: @@ -1607,6 +1650,10 @@ class ResultNode(UnaryNode): limit: Optional[int] = None # TODO: CTE definitions + def _validate(self): + for ref, name in self.output_cols: + assert ref.id in self.child.ids + @property def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return () @@ -1657,6 +1704,10 @@ def row_count(self) -> Optional[int]: def variables_introduced(self) -> int: 
return 0 + @property + def _node_expressions(self): + return tuple(ref for ref, _ in self.output_cols) + # Tree operators def top_down( diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py index 5d554d45d7..4e5295ae9d 100644 --- a/bigframes/core/rewrite/__init__.py +++ b/bigframes/core/rewrite/__init__.py @@ -22,6 +22,7 @@ try_reduce_to_local_scan, try_reduce_to_table_scan, ) +from bigframes.core.rewrite.select_pullup import defer_selection from bigframes.core.rewrite.slices import pull_out_limit, pull_up_limits, rewrite_slice from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions from bigframes.core.rewrite.windows import pull_out_window_order, rewrite_range_rolling @@ -42,4 +43,5 @@ "try_reduce_to_local_scan", "fold_row_counts", "pull_out_window_order", + "defer_selection", ] diff --git a/bigframes/core/rewrite/pruning.py b/bigframes/core/rewrite/pruning.py index 1ecfb452ec..8a07f0b87e 100644 --- a/bigframes/core/rewrite/pruning.py +++ b/bigframes/core/rewrite/pruning.py @@ -13,7 +13,7 @@ # limitations under the License. 
import dataclasses import functools -from typing import AbstractSet +import typing from bigframes.core import identifiers, nodes @@ -143,7 +143,7 @@ def prune_selection_child( def prune_node( node: nodes.BigFrameNode, - ids: AbstractSet[identifiers.ColumnId], + ids: typing.AbstractSet[identifiers.ColumnId], ): # This clause is important, ensures idempotency, so can reach fixed point if not (set(node.ids) - ids): @@ -157,7 +157,7 @@ def prune_node( def prune_aggregate( node: nodes.AggregateNode, - used_cols: AbstractSet[identifiers.ColumnId], + used_cols: typing.AbstractSet[identifiers.ColumnId], ) -> nodes.AggregateNode: pruned_aggs = ( tuple(agg for agg in node.aggregations if agg[1] in used_cols) @@ -169,7 +169,7 @@ def prune_aggregate( @functools.singledispatch def prune_leaf( node: nodes.BigFrameNode, - used_cols: AbstractSet[identifiers.ColumnId], + used_cols: typing.AbstractSet[identifiers.ColumnId], ): ... @@ -177,7 +177,7 @@ def prune_leaf( @prune_leaf.register def prune_readlocal( node: nodes.ReadLocalNode, - selection: AbstractSet[identifiers.ColumnId], + selection: typing.AbstractSet[identifiers.ColumnId], ) -> nodes.ReadLocalNode: new_scan_list = node.scan_list.filter_cols(selection) return dataclasses.replace( @@ -190,7 +190,7 @@ def prune_readlocal( @prune_leaf.register def prune_readtable( node: nodes.ReadTableNode, - selection: AbstractSet[identifiers.ColumnId], + selection: typing.AbstractSet[identifiers.ColumnId], ) -> nodes.ReadTableNode: new_scan_list = node.scan_list.filter_cols(selection) return dataclasses.replace(node, scan_list=new_scan_list) diff --git a/bigframes/core/rewrite/select_pullup.py b/bigframes/core/rewrite/select_pullup.py new file mode 100644 index 0000000000..3a2de1238b --- /dev/null +++ b/bigframes/core/rewrite/select_pullup.py @@ -0,0 +1,144 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses +from typing import cast + +from bigframes.core import expression, nodes + + +def defer_selection( + root: nodes.BigFrameNode, +) -> nodes.BigFrameNode: + """ + Defers SelectionNode operations in the tree, pulling them up. + + In many cases, these nodes will be merged or eliminated entirely, simplifying the overall tree. + """ + return nodes.bottom_up(root, pull_up_select) + + +def pull_up_select(node: nodes.BigFrameNode) -> nodes.BigFrameNode: + if isinstance(node, nodes.LeafNode): + return node + if isinstance(node, nodes.JoinNode): + return pull_up_selects_under_join(node) + if isinstance(node, nodes.ConcatNode): + return handle_selects_under_concat(node) + if isinstance(node, nodes.UnaryNode): + return pull_up_select_unary(node) + # shouldn't hit this, but not worth crashing over + return node + + +def pull_up_select_unary(node: nodes.UnaryNode) -> nodes.BigFrameNode: + child = node.child + if not isinstance(child, nodes.SelectionNode): + return node + + # Schema-preserving nodes + if isinstance( + node, + ( + nodes.ReversedNode, + nodes.OrderByNode, + nodes.SliceNode, + nodes.FilterNode, + nodes.RandomSampleNode, + ), + ): + pushed_down_node: nodes.BigFrameNode = node.remap_refs( + {id: ref.id for ref, id in child.input_output_pairs} + ).replace_child(child.child) + pulled_up_select = cast( + nodes.SelectionNode, child.replace_child(pushed_down_node) + ) + return pulled_up_select + elif isinstance( + node, + ( + nodes.SelectionNode, + nodes.ResultNode, + ), + ): + return node.remap_refs( + {id: ref.id for ref, id in 
child.input_output_pairs} + ).replace_child(child.child) + elif isinstance(node, nodes.AggregateNode): + pushed_down_agg = node.remap_refs( + {id: ref.id for ref, id in child.input_output_pairs} + ).replace_child(child.child) + new_selection = tuple( + nodes.AliasedRef.identity(id).remap_refs( + {id: ref.id for ref, id in child.input_output_pairs} + ) + for id in node.ids + ) + return nodes.SelectionNode(pushed_down_agg, new_selection) + elif isinstance(node, nodes.ExplodeNode): + pushed_down_node = node.remap_refs( + {id: ref.id for ref, id in child.input_output_pairs} + ).replace_child(child.child) + pulled_up_select = cast( + nodes.SelectionNode, child.replace_child(pushed_down_node) + ) + if node.offsets_col: + pulled_up_select = dataclasses.replace( + pulled_up_select, + input_output_pairs=( + *pulled_up_select.input_output_pairs, + nodes.AliasedRef( + expression.DerefOp(node.offsets_col), node.offsets_col + ), + ), + ) + return pulled_up_select + elif isinstance(node, nodes.AdditiveNode): + pushed_down_node = node.replace_additive_base(child.child).remap_refs( + {id: ref.id for ref, id in child.input_output_pairs} + ) + new_selection = ( + *child.input_output_pairs, + *( + nodes.AliasedRef(expression.DerefOp(col.id), col.id) + for col in node.added_fields + ), + ) + pulled_up_select = dataclasses.replace( + child, child=pushed_down_node, input_output_pairs=new_selection + ) + return pulled_up_select + # shouldn't hit this, but not worth crashing over + return node + + +def pull_up_selects_under_join(node: nodes.JoinNode) -> nodes.JoinNode: + # Can in theory pull up selects here, but it is a bit dangerous, in particular or self-joins, when there are more transforms to do. 
+ # TODO: Safely pull up selects above join + return node + + +def handle_selects_under_concat(node: nodes.ConcatNode) -> nodes.ConcatNode: + new_children = [] + for child in node.child_nodes: + # remove select if no-op + if not isinstance(child, nodes.SelectionNode): + new_children.append(child) + else: + inputs = (ref.id for ref in child.input_output_pairs) + if inputs == tuple(child.child.ids): + new_children.append(child.child) + else: + new_children.append(child) + return dataclasses.replace(node, children=tuple(new_children)) diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index ba3fdcfd4b..dd37a352a7 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -148,6 +148,12 @@ def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str: # first character must be letter or underscore identifier = "_" + identifier + else: + # Even with flexible column names, there are constraints + # Convert illegal characters + # See: https://cloud.google.com/bigquery/docs/schemas#flexible-column-names + identifier = re.sub(r"[!\"$\(\)\*\,\./;\?@[\]^`{}~]", "_", identifier) + # Except in special circumstances (true anonymous query results tables), # field names are not allowed to start with these (case-insensitive) # prefixes. 
diff --git a/bigframes/core/window_spec.py b/bigframes/core/window_spec.py index 2be30135ee..bef5fbea7c 100644 --- a/bigframes/core/window_spec.py +++ b/bigframes/core/window_spec.py @@ -16,7 +16,7 @@ from dataclasses import dataclass, replace import datetime import itertools -from typing import Literal, Mapping, Optional, Set, Tuple, Union +from typing import Literal, Mapping, Optional, Sequence, Set, Tuple, Union import numpy as np import pandas as pd @@ -260,6 +260,11 @@ def is_unbounded(self): self.bounds.start is None and self.bounds.end is None ) + @property + def expressions(self) -> Sequence[ex.Expression]: + ordering_exprs = (item.scalar_expression for item in self.ordering) + return (*self.grouping_keys, *ordering_exprs) + @property def all_referenced_columns(self) -> Set[ids.ColumnId]: """ diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 38879d3ec0..495e242f43 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -562,17 +562,6 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: ) return DataFrame(self._block.select_columns(selected_columns)) - def _select_exact_dtypes( - self, dtypes: Sequence[bigframes.dtypes.Dtype] - ) -> DataFrame: - """Selects columns without considering inheritance relationships.""" - columns = [ - col_id - for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) - if dtype in dtypes - ] - return DataFrame(self._block.select_columns(columns)) - def _set_internal_query_job(self, query_job: Optional[bigquery.QueryJob]): self._query_job = query_job @@ -736,7 +725,9 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - if opts.repr_mode == "deferred": + # anywidget mode uses the same display logic as the "deferred" mode + # for faster execution + if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) # TODO(swast): pass max_columns and get the true column count back.
Maybe @@ -785,6 +776,23 @@ def _repr_html_(self) -> str: if opts.repr_mode == "deferred": return formatter.repr_query_job(self._compute_dry_run()) + if opts.repr_mode == "anywidget": + import anywidget # type: ignore + + # create an iterator for the data batches + batches = self.to_pandas_batches() + + # get the first page result + try: + first_page = next(iter(batches)) + except StopIteration: + first_page = pandas.DataFrame(columns=self.columns) + + # Instantiate and return the widget. The widget's frontend will + # handle the display of the table and pagination + return anywidget.AnyWidget(dataframe=first_page) + + self._cached() df = self.copy() if bigframes.options.display.blob_display: blob_cols = [ @@ -3079,92 +3087,9 @@ def melt( ) def describe(self, include: None | Literal["all"] = None) -> DataFrame: - if include is None: - numeric_df = self._select_exact_dtypes( - bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE - + bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES - ) - if len(numeric_df.columns) == 0: - # Describe eligible non-numeric columns - return self._describe_non_numeric() - - # Otherwise, only describe numeric columns - return self._describe_numeric() - - elif include == "all": - numeric_result = self._describe_numeric() - non_numeric_result = self._describe_non_numeric() - - if len(numeric_result.columns) == 0: - return non_numeric_result - elif len(non_numeric_result.columns) == 0: - return numeric_result - else: - import bigframes.core.reshape.api as rs + from bigframes.pandas.core.methods import describe - # Use reindex after join to preserve the original column order. 
- return rs.concat( - [non_numeric_result, numeric_result], axis=1 - )._reindex_columns(self.columns) - - else: - raise ValueError(f"Unsupported include type: {include}") - - def _describe_numeric(self) -> DataFrame: - number_df_result = typing.cast( - DataFrame, - self._select_exact_dtypes( - bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE - ).agg( - [ - "count", - "mean", - "std", - "min", - "25%", - "50%", - "75%", - "max", - ] - ), - ) - temporal_df_result = typing.cast( - DataFrame, - self._select_exact_dtypes( - bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES - ).agg(["count"]), - ) - - if len(number_df_result.columns) == 0: - return temporal_df_result - elif len(temporal_df_result.columns) == 0: - return number_df_result - else: - import bigframes.core.reshape.api as rs - - original_columns = self._select_exact_dtypes( - bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE - + bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES - ).columns - - # Use reindex after join to preserve the original column order. 
- return rs.concat( - [number_df_result, temporal_df_result], - axis=1, - )._reindex_columns(original_columns) - - def _describe_non_numeric(self) -> DataFrame: - return typing.cast( - DataFrame, - self._select_exact_dtypes( - [ - bigframes.dtypes.STRING_DTYPE, - bigframes.dtypes.BOOL_DTYPE, - bigframes.dtypes.BYTES_DTYPE, - bigframes.dtypes.TIME_DTYPE, - ] - ).agg(["count", "nunique"]), - ) + return typing.cast(DataFrame, describe.describe(self, include)) def skew(self, *, numeric_only: bool = False): if not numeric_only: diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index 8924295c29..eda24a74f0 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -71,6 +71,10 @@ class OperationAbortedError(RuntimeError): """Operation is aborted.""" +class MaximumResultRowsExceeded(RuntimeError): + """Maximum number of rows in the result was exceeded.""" + + class TimeTravelDisabledWarning(Warning): """A query was reattempted without time travel.""" diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index e11f7d82ba..73b8ba8dbc 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -35,7 +35,17 @@ class BaseBqml: def __init__(self, session: bigframes.session.Session): self._session = session - self._base_sql_generator = ml_sql.BaseSqlGenerator() + self._sql_generator = ml_sql.BaseSqlGenerator() + + def ai_forecast( + self, + input_data: bpd.DataFrame, + options: Mapping[str, Union[str, int, float, Iterable[str]]], + ) -> bpd.DataFrame: + result_sql = self._sql_generator.ai_forecast( + source_sql=input_data.sql, options=options + ) + return self._session.read_gbq(result_sql) class BqmlModel(BaseBqml): @@ -55,8 +65,8 @@ def __init__(self, session: bigframes.Session, model: bigquery.Model): self._model = model model_ref = self._model.reference assert model_ref is not None - self._model_manipulation_sql_generator = ml_sql.ModelManipulationSqlGenerator( - model_ref + self._sql_generator: ml_sql.ModelManipulationSqlGenerator = ( + 
ml_sql.ModelManipulationSqlGenerator(model_ref) ) def _apply_ml_tvf( @@ -126,13 +136,13 @@ def model(self) -> bigquery.Model: def recommend(self, input_data: bpd.DataFrame) -> bpd.DataFrame: return self._apply_ml_tvf( input_data, - self._model_manipulation_sql_generator.ml_recommend, + self._sql_generator.ml_recommend, ) def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame: return self._apply_ml_tvf( input_data, - self._model_manipulation_sql_generator.ml_predict, + self._sql_generator.ml_predict, ) def explain_predict( @@ -140,16 +150,14 @@ def explain_predict( ) -> bpd.DataFrame: return self._apply_ml_tvf( input_data, - lambda source_sql: self._model_manipulation_sql_generator.ml_explain_predict( + lambda source_sql: self._sql_generator.ml_explain_predict( source_sql=source_sql, struct_options=options, ), ) def global_explain(self, options: Mapping[str, bool]) -> bpd.DataFrame: - sql = self._model_manipulation_sql_generator.ml_global_explain( - struct_options=options - ) + sql = self._sql_generator.ml_global_explain(struct_options=options) return ( self._session.read_gbq(sql) .sort_values(by="attribution", ascending=False) @@ -159,7 +167,7 @@ def global_explain(self, options: Mapping[str, bool]) -> bpd.DataFrame: def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame: return self._apply_ml_tvf( input_data, - self._model_manipulation_sql_generator.ml_transform, + self._sql_generator.ml_transform, ) def generate_text( @@ -170,7 +178,7 @@ def generate_text( options["flatten_json_output"] = True return self._apply_ml_tvf( input_data, - lambda source_sql: self._model_manipulation_sql_generator.ml_generate_text( + lambda source_sql: self._sql_generator.ml_generate_text( source_sql=source_sql, struct_options=options, ), @@ -186,7 +194,7 @@ def generate_embedding( options["flatten_json_output"] = True return self._apply_ml_tvf( input_data, - lambda source_sql: self._model_manipulation_sql_generator.ml_generate_embedding( + lambda source_sql: 
self._sql_generator.ml_generate_embedding( source_sql=source_sql, struct_options=options, ), @@ -201,7 +209,7 @@ def generate_table( ) -> bpd.DataFrame: return self._apply_ml_tvf( input_data, - lambda source_sql: self._model_manipulation_sql_generator.ai_generate_table( + lambda source_sql: self._sql_generator.ai_generate_table( source_sql=source_sql, struct_options=options, ), @@ -216,14 +224,14 @@ def detect_anomalies( return self._apply_ml_tvf( input_data, - lambda source_sql: self._model_manipulation_sql_generator.ml_detect_anomalies( + lambda source_sql: self._sql_generator.ml_detect_anomalies( source_sql=source_sql, struct_options=options, ), ) def forecast(self, options: Mapping[str, int | float]) -> bpd.DataFrame: - sql = self._model_manipulation_sql_generator.ml_forecast(struct_options=options) + sql = self._sql_generator.ml_forecast(struct_options=options) timestamp_col_name = "forecast_timestamp" index_cols = [timestamp_col_name] first_col_name = self._session.read_gbq(sql).columns.values[0] @@ -232,9 +240,7 @@ def forecast(self, options: Mapping[str, int | float]) -> bpd.DataFrame: return self._session.read_gbq(sql, index_col=index_cols).reset_index() def explain_forecast(self, options: Mapping[str, int | float]) -> bpd.DataFrame: - sql = self._model_manipulation_sql_generator.ml_explain_forecast( - struct_options=options - ) + sql = self._sql_generator.ml_explain_forecast(struct_options=options) timestamp_col_name = "time_series_timestamp" index_cols = [timestamp_col_name] first_col_name = self._session.read_gbq(sql).columns.values[0] @@ -243,7 +249,7 @@ def explain_forecast(self, options: Mapping[str, int | float]) -> bpd.DataFrame: return self._session.read_gbq(sql, index_col=index_cols).reset_index() def evaluate(self, input_data: Optional[bpd.DataFrame] = None): - sql = self._model_manipulation_sql_generator.ml_evaluate( + sql = self._sql_generator.ml_evaluate( input_data.sql if (input_data is not None) else None ) @@ -254,28 +260,24 @@ def 
llm_evaluate( input_data: bpd.DataFrame, task_type: Optional[str] = None, ): - sql = self._model_manipulation_sql_generator.ml_llm_evaluate( - input_data.sql, task_type - ) + sql = self._sql_generator.ml_llm_evaluate(input_data.sql, task_type) return self._session.read_gbq(sql) def arima_evaluate(self, show_all_candidate_models: bool = False): - sql = self._model_manipulation_sql_generator.ml_arima_evaluate( - show_all_candidate_models - ) + sql = self._sql_generator.ml_arima_evaluate(show_all_candidate_models) return self._session.read_gbq(sql) def arima_coefficients(self) -> bpd.DataFrame: - sql = self._model_manipulation_sql_generator.ml_arima_coefficients() + sql = self._sql_generator.ml_arima_coefficients() return self._session.read_gbq(sql) def centroids(self) -> bpd.DataFrame: assert self._model.model_type == "KMEANS" - sql = self._model_manipulation_sql_generator.ml_centroids() + sql = self._sql_generator.ml_centroids() return self._session.read_gbq( sql, index_col=["centroid_id", "feature"] @@ -284,7 +286,7 @@ def centroids(self) -> bpd.DataFrame: def principal_components(self) -> bpd.DataFrame: assert self._model.model_type == "PCA" - sql = self._model_manipulation_sql_generator.ml_principal_components() + sql = self._sql_generator.ml_principal_components() return self._session.read_gbq( sql, index_col=["principal_component_id", "feature"] @@ -293,7 +295,7 @@ def principal_components(self) -> bpd.DataFrame: def principal_component_info(self) -> bpd.DataFrame: assert self._model.model_type == "PCA" - sql = self._model_manipulation_sql_generator.ml_principal_component_info() + sql = self._sql_generator.ml_principal_component_info() return self._session.read_gbq(sql) @@ -319,7 +321,7 @@ def register(self, vertex_ai_model_id: Optional[str] = None) -> BqmlModel: # truncate as Vertex ID only accepts 63 characters, easily exceeding the limit for temp models. # The possibility of conflicts should be low. 
vertex_ai_model_id = vertex_ai_model_id[:63] - sql = self._model_manipulation_sql_generator.alter_model( + sql = self._sql_generator.alter_model( options={"vertex_ai_model_id": vertex_ai_model_id} ) # Register the model and wait it to finish diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 84ea37c5fc..2937368c92 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -49,6 +49,12 @@ def build_parameters(self, **kwargs: Union[str, int, float, Iterable[str]]) -> s param_strs = [f"{k}={self.encode_value(v)}" for k, v in kwargs.items()] return "\n" + INDENT_STR + f",\n{INDENT_STR}".join(param_strs) + def build_named_parameters( + self, **kwargs: Union[str, int, float, Iterable[str]] + ) -> str: + param_strs = [f"{k} => {self.encode_value(v)}" for k, v in kwargs.items()] + return "\n" + INDENT_STR + f",\n{INDENT_STR}".join(param_strs) + def build_structs(self, **kwargs: Union[int, float, str, Mapping]) -> str: """Encode a dict of values into a formatted STRUCT items for SQL""" param_strs = [] @@ -187,6 +193,17 @@ def ml_distance( https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-distance""" return f"""SELECT *, ML.DISTANCE({sql_utils.identifier(col_x)}, {sql_utils.identifier(col_y)}, '{type}') AS {sql_utils.identifier(name)} FROM ({source_sql})""" + def ai_forecast( + self, + source_sql: str, + options: Mapping[str, Union[int, float, bool, Iterable[str]]], + ): + """Encode AI.FORECAST. + https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-forecast""" + named_parameters_sql = self.build_named_parameters(**options) + + return f"""SELECT * FROM AI.FORECAST(({source_sql}),{named_parameters_sql})""" + class ModelCreationSqlGenerator(BaseSqlGenerator): """Sql generator for creating a model entity. 
Model id is the standalone id without project id and dataset id.""" diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index 30192695ac..10c842c64c 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -16,25 +16,24 @@ import re import typing -from typing import Dict, List, Optional, Sequence +from typing import Dict, Iterable, List, Optional, Sequence, Union import warnings import numpy as np -from bigframes import dtypes, exceptions +from bigframes import dtypes, exceptions, options from bigframes.core import guid, log_adapter @log_adapter.class_logger class AIAccessor: - def __init__(self, df) -> None: + def __init__(self, df, base_bqml=None) -> None: import bigframes # Import in the function body to avoid circular imports. import bigframes.dataframe - - if not bigframes.options.experiments.ai_operators: - raise NotImplementedError() + from bigframes.ml import core as ml_core self._df: bigframes.dataframe.DataFrame = df + self._base_bqml: ml_core.BaseBqml = base_bqml or ml_core.BaseBqml(df._session) def filter( self, @@ -89,6 +88,8 @@ def filter( ValueError: when the instruction refers to a non-existing column, or when no columns are referred to. """ + if not options.experiments.ai_operators: + raise NotImplementedError() answer_col = "answer" @@ -181,6 +182,9 @@ def map( ValueError: when the instruction refers to a non-existing column, or when no columns are referred to. """ + if not options.experiments.ai_operators: + raise NotImplementedError() + import bigframes.dataframe import bigframes.series @@ -320,6 +324,8 @@ def classify( columns are referred to, or when the count of labels does not meet the requirement. """ + if not options.experiments.ai_operators: + raise NotImplementedError() if len(labels) < 2 or len(labels) > 20: raise ValueError( @@ -401,6 +407,9 @@ def join( Raises: ValueError if the amount of data that will be sent for LLM processing is larger than max_rows. 
""" + if not options.experiments.ai_operators: + raise NotImplementedError() + self._validate_model(model) columns = self._parse_columns(instruction) @@ -525,6 +534,8 @@ def search( ValueError: when the search_column is not found from the the data frame. TypeError: when the provided model is not TextEmbeddingGenerator. """ + if not options.experiments.ai_operators: + raise NotImplementedError() if search_column not in self._df.columns: raise ValueError(f"Column `{search_column}` not found") @@ -640,6 +651,9 @@ def top_k( ValueError: when the instruction refers to a non-existing column, or when no columns are referred to. """ + if not options.experiments.ai_operators: + raise NotImplementedError() + import bigframes.dataframe import bigframes.series @@ -834,6 +848,8 @@ def sim_join( Raises: ValueError: when the amount of data to be processed exceeds the specified max_rows. """ + if not options.experiments.ai_operators: + raise NotImplementedError() if left_on not in self._df.columns: raise ValueError(f"Left column {left_on} not found") @@ -883,6 +899,73 @@ def sim_join( return join_result + def forecast( + self, + timestamp_column: str, + data_column: str, + *, + model: str = "TimesFM 2.0", + id_columns: Optional[Iterable[str]] = None, + horizon: int = 10, + confidence_level: float = 0.95, + ): + """ + Forecast time series at future horizon. Using Google Research's open source TimesFM(https://github.com/google-research/timesfm) model. + + .. note:: + + This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" + and might have limited support. For more information, see the launch stage descriptions + (https://cloud.google.com/products#product-launch-stages). + + Args: + timestamp_column (str): + A str value that specified the name of the time points column. 
+ The time points column provides the time points used to generate the forecast. + The time points column must use one of the following data types: TIMESTAMP, DATE and DATETIME + data_column (str): + A str value that specifies the name of the data column. The data column contains the data to forecast. + The data column must use one of the following data types: INT64, NUMERIC and FLOAT64 + model (str, default "TimesFM 2.0"): + A str value that specifies the name of the model. TimesFM 2.0 is the only supported value, and is the default value. + id_columns (Iterable[str] or None, default None): + An iterable of str value that specifies the names of one or more ID columns. Each ID identifies a unique time series to forecast. + Specify one or more values for this argument in order to forecast multiple time series using a single query. + The columns that you specify must use one of the following data types: STRING, INT64, ARRAY and ARRAY + horizon (int, default 10): + An int value that specifies the number of time points to forecast. The default value is 10. The valid input range is [1, 10,000]. + confidence_level (float, default 0.95): + A FLOAT64 value that specifies the percentage of the future values that fall in the prediction interval. + The default value is 0.95. The valid input range is [0, 1). + + Returns: + DataFrame: + The forecast dataframe matches that of the BigQuery AI.FORECAST function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-forecast + + Raises: + ValueError: when referring to a non-existing column. 
+ """ + columns = [timestamp_column, data_column] + if id_columns: + columns += id_columns + for column in columns: + if column not in self._df.columns: + raise ValueError(f"Column `{column}` not found") + + options: dict[str, Union[int, float, str, Iterable[str]]] = { + "data_col": data_column, + "timestamp_col": timestamp_column, + "model": model, + "horizon": horizon, + "confidence_level": confidence_level, + } + if id_columns: + options["id_cols"] = id_columns + + return self._base_bqml.ai_forecast(input_data=self._df, options=options) + @staticmethod def _attach_embedding(dataframe, source_column: str, embedding_column: str, model): result_df = dataframe.copy() diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index e143cfc519..63875ded99 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -303,6 +303,7 @@ def get_runtime_json_str( def exif( self, *, + engine: Literal[None, "pillow"] = None, connection: Optional[str] = None, max_batching_rows: int = 8192, container_cpu: Union[float, int] = 0.33, @@ -311,6 +312,7 @@ def exif( """Extract EXIF data. Now only support image types. Args: + engine ('pillow' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. max_batching_rows (int, default 8,192): Max number of rows per batch send to cloud run to execute the function. container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers. @@ -319,6 +321,8 @@ def exif( Returns: bigframes.series.Series: JSON series of key-value pairs. 
""" + if engine is None or engine.casefold() != "pillow": + raise ValueError("Must specify the engine, supported value is 'pillow'.") import bigframes.bigquery as bbq import bigframes.blob._functions as blob_func @@ -344,6 +348,7 @@ def image_blur( self, ksize: tuple[int, int], *, + engine: Literal[None, "opencv"] = None, dst: Optional[Union[str, bigframes.series.Series]] = None, connection: Optional[str] = None, max_batching_rows: int = 8192, @@ -354,6 +359,7 @@ def image_blur( Args: ksize (tuple(int, int)): Kernel size. + engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of: str: GCS folder str. The output filenames are the same as the input files. blob Series: The output file paths are determined by the uris of the blob Series. @@ -367,6 +373,9 @@ def image_blur( Returns: bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. """ + if engine is None or engine.casefold() != "opencv": + raise ValueError("Must specify the engine, supported value is 'opencv'.") + import bigframes.blob._functions as blob_func connection = self._resolve_connection(connection) @@ -424,6 +433,7 @@ def image_resize( self, dsize: tuple[int, int] = (0, 0), *, + engine: Literal[None, "opencv"] = None, fx: float = 0.0, fy: float = 0.0, dst: Optional[Union[str, bigframes.series.Series]] = None, @@ -436,6 +446,7 @@ def image_resize( Args: dsize (tuple(int, int), default (0, 0)): Destination size. If set to 0, fx and fy parameters determine the size. + engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. fx (float, default 0.0): scale factor along the horizontal axis. If set to 0.0, dsize parameter determines the output size. fy (float, defalut 0.0): scale factor along the vertical axis. 
If set to 0.0, dsize parameter determines the output size. dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of: @@ -451,6 +462,9 @@ def image_resize( Returns: bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. """ + if engine is None or engine.casefold() != "opencv": + raise ValueError("Must specify the engine, supported value is 'opencv'.") + dsize_set = dsize[0] > 0 and dsize[1] > 0 fsize_set = fx > 0.0 and fy > 0.0 if not dsize_set ^ fsize_set: @@ -516,6 +530,7 @@ def image_resize( def image_normalize( self, *, + engine: Literal[None, "opencv"] = None, alpha: float = 1.0, beta: float = 0.0, norm_type: str = "l2", @@ -528,6 +543,7 @@ def image_normalize( """Normalize images. Args: + engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. alpha (float, default 1.0): Norm value to normalize to or the lower range boundary in case of the range normalization. beta (float, default 0.0): Upper range boundary in case of the range normalization; it is not used for the norm normalization. norm_type (str, default "l2"): Normalization type. Accepted values are "inf", "l1", "l2" and "minmax". @@ -544,6 +560,9 @@ def image_normalize( Returns: bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. """ + if engine is None or engine.casefold() != "opencv": + raise ValueError("Must specify the engine, supported value is 'opencv'.") + import bigframes.blob._functions as blob_func connection = self._resolve_connection(connection) @@ -604,6 +623,7 @@ def image_normalize( def pdf_extract( self, *, + engine: Literal[None, "pypdf"] = None, connection: Optional[str] = None, max_batching_rows: int = 1, container_cpu: Union[float, int] = 2, @@ -613,6 +633,7 @@ def pdf_extract( """Extracts text from PDF URLs and saves the text as string. 
Args: + engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. @@ -631,6 +652,9 @@ def pdf_extract( Contains the extracted text from the PDF file. Includes error messages if verbosity is enabled. """ + if engine is None or engine.casefold() != "pypdf": + raise ValueError("Must specify the engine, supported value is 'pypdf'.") + import bigframes.bigquery as bbq import bigframes.blob._functions as blob_func import bigframes.pandas as bpd @@ -663,6 +687,7 @@ def pdf_extract( def pdf_chunk( self, *, + engine: Literal[None, "pypdf"] = None, connection: Optional[str] = None, chunk_size: int = 2000, overlap_size: int = 200, @@ -675,6 +700,7 @@ def pdf_chunk( arrays of strings. Args: + engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. @@ -698,6 +724,8 @@ def pdf_chunk( where each string is a chunk of text extracted from PDF. Includes error messages if verbosity is enabled. """ + if engine is None or engine.casefold() != "pypdf": + raise ValueError("Must specify the engine, supported value is 'pypdf'.") import bigframes.bigquery as bbq import bigframes.blob._functions as blob_func @@ -740,6 +768,7 @@ def pdf_chunk( def audio_transcribe( self, *, + engine: Literal["bigquery"] = "bigquery", connection: Optional[str] = None, model_name: Optional[ Literal[ @@ -753,6 +782,7 @@ def audio_transcribe( Transcribe audio content using a Gemini multimodal model. Args: + engine ('bigquery'): The engine (bigquery or third party library) used for the function. 
connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. @@ -770,6 +800,9 @@ def audio_transcribe( Contains the transcribed text from the audio file. Includes error messages if verbosity is enabled. """ + if engine.casefold() != "bigquery": + raise ValueError("Must specify the engine, supported value is 'bigquery'.") + import bigframes.bigquery as bbq import bigframes.ml.llm as llm import bigframes.pandas as bpd diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index e8253769be..a9d1c31865 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -38,6 +38,7 @@ import bigframes.functions._utils as bff_utils from bigframes.pandas.core.api import to_timedelta from bigframes.pandas.io.api import ( + _read_gbq_colab, from_glob_path, read_csv, read_gbq, @@ -335,6 +336,7 @@ def reset_session(): qcut, read_csv, read_gbq, + _read_gbq_colab, read_gbq_function, read_gbq_model, read_gbq_object_table, diff --git a/tests/unit/core/compile/sqlglot/test_compile_projection.py b/bigframes/pandas/core/methods/__init__.py similarity index 63% rename from tests/unit/core/compile/sqlglot/test_compile_projection.py rename to bigframes/pandas/core/methods/__init__.py index 82e6c60668..0a2669d7a2 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_projection.py +++ b/bigframes/pandas/core/methods/__init__.py @@ -11,15 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -import pytest - -import bigframes - -pytest.importorskip("pytest_snapshot") - - -def test_compile_projection(compiler_session: bigframes.Session, snapshot): - bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") - bf_df["int64_col"] = bf_df["int64_col"] + 1 - snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/bigframes/pandas/core/methods/describe.py b/bigframes/pandas/core/methods/describe.py new file mode 100644 index 0000000000..18d2318379 --- /dev/null +++ b/bigframes/pandas/core/methods/describe.py @@ -0,0 +1,129 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import typing + +from bigframes import dataframe, dtypes, series +from bigframes.core.reshape import api as rs + + +def describe( + input: dataframe.DataFrame | series.Series, + include: None | typing.Literal["all"], +) -> dataframe.DataFrame | series.Series: + if isinstance(input, series.Series): + # Convert the series to a dataframe, describe it, and cast the result back to a series. 
+ return series.Series(describe(input.to_frame(), include)._block) + elif not isinstance(input, dataframe.DataFrame): + raise TypeError(f"Unsupported type: {type(input)}") + + if include is None: + numeric_df = _select_dtypes( + input, + dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE + + dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES, + ) + if len(numeric_df.columns) == 0: + # Describe eligible non-numeric columns + return _describe_non_numeric(input) + + # Otherwise, only describe numeric columns + return _describe_numeric(input) + + elif include == "all": + numeric_result = _describe_numeric(input) + non_numeric_result = _describe_non_numeric(input) + + if len(numeric_result.columns) == 0: + return non_numeric_result + elif len(non_numeric_result.columns) == 0: + return numeric_result + else: + # Use reindex after join to preserve the original column order. + return rs.concat( + [non_numeric_result, numeric_result], axis=1 + )._reindex_columns(input.columns) + + else: + raise ValueError(f"Unsupported include type: {include}") + + +def _describe_numeric(df: dataframe.DataFrame) -> dataframe.DataFrame: + number_df_result = typing.cast( + dataframe.DataFrame, + _select_dtypes(df, dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE).agg( + [ + "count", + "mean", + "std", + "min", + "25%", + "50%", + "75%", + "max", + ] + ), + ) + temporal_df_result = typing.cast( + dataframe.DataFrame, + _select_dtypes(df, dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES).agg(["count"]), + ) + + if len(number_df_result.columns) == 0: + return temporal_df_result + elif len(temporal_df_result.columns) == 0: + return number_df_result + else: + import bigframes.core.reshape.api as rs + + original_columns = _select_dtypes( + df, + dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE + + dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES, + ).columns + + # Use reindex after join to preserve the original column order. 
+ return rs.concat( + [number_df_result, temporal_df_result], + axis=1, + )._reindex_columns(original_columns) + + +def _describe_non_numeric(df: dataframe.DataFrame) -> dataframe.DataFrame: + return typing.cast( + dataframe.DataFrame, + _select_dtypes( + df, + [ + dtypes.STRING_DTYPE, + dtypes.BOOL_DTYPE, + dtypes.BYTES_DTYPE, + dtypes.TIME_DTYPE, + ], + ).agg(["count", "nunique"]), + ) + + +def _select_dtypes( + df: dataframe.DataFrame, dtypes: typing.Sequence[dtypes.Dtype] +) -> dataframe.DataFrame: + """Selects columns without considering inheritance relationships.""" + columns = [ + col_id + for col_id, dtype in zip(df._block.value_columns, df._block.dtypes) + if dtype in dtypes + ] + return dataframe.DataFrame(df._block.select_columns(columns)) diff --git a/bigframes/series.py b/bigframes/series.py index 7a318c4c70..ae6cd7b2ad 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -430,7 +430,9 @@ def __repr__(self) -> str: # metadata, like we do with DataFrame. opts = bigframes.options.display max_results = opts.max_rows - if opts.repr_mode == "deferred": + # anywdiget mode uses the same display logic as the "deferred" mode + # for faster execution + if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) self._cached() @@ -1293,6 +1295,11 @@ def agg(self, func: str | typing.Sequence[str]) -> scalars.Scalar | Series: aggregate = agg aggregate.__doc__ = inspect.getdoc(vendored_pandas_series.Series.agg) + def describe(self) -> Series: + from bigframes.pandas.core.methods import describe + + return cast(Series, describe.describe(self, include="all")) + def skew(self): count = self.count() if count < 3: diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 47be6fa768..9ad8da33a8 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -28,6 +28,8 @@ import google.cloud.bigquery.table as bq_table import 
google.cloud.bigquery_storage_v1 +import bigframes +from bigframes import exceptions as bfe import bigframes.constants import bigframes.core from bigframes.core import compile, local_data, rewrite @@ -38,7 +40,6 @@ import bigframes.core.schema as schemata import bigframes.core.tree_properties as tree_properties import bigframes.dtypes -import bigframes.exceptions as bfe import bigframes.features from bigframes.session import executor, loader, local_scan_executor, read_api_execution import bigframes.session._io.bigquery as bq_io @@ -415,7 +416,7 @@ def _run_execute_query( # Unfortunately, this error type does not have a separate error code or exception type if "Resources exceeded during query execution" in e.message: new_message = "Computation is too complex to execute as a single query. Try using DataFrame.cache() on intermediate results, or setting bigframes.options.compute.enable_multi_query_execution." - raise bigframes.exceptions.QueryComplexityError(new_message) from e + raise bfe.QueryComplexityError(new_message) from e else: raise @@ -688,7 +689,7 @@ def _execute_plan( ) return executor.ExecuteResult( - arrow_batches=iterator.to_arrow_iterable( + _arrow_batches=iterator.to_arrow_iterable( bqstorage_client=self.bqstoragereadclient ), schema=plan.schema, diff --git a/bigframes/session/direct_gbq_execution.py b/bigframes/session/direct_gbq_execution.py index 4b19f7441d..1d46192ac3 100644 --- a/bigframes/session/direct_gbq_execution.py +++ b/bigframes/session/direct_gbq_execution.py @@ -50,7 +50,7 @@ def execute( ) return executor.ExecuteResult( - arrow_batches=iterator.to_arrow_iterable(), + _arrow_batches=iterator.to_arrow_iterable(), schema=plan.schema, query_job=query_job, total_rows=iterator.total_rows, diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index ee1218017b..c913f39791 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -24,20 +24,46 @@ import pandas as pd import pyarrow +import bigframes 
import bigframes.core from bigframes.core import pyarrow_utils import bigframes.core.schema import bigframes.session._io.pandas as io_pandas +_ROW_LIMIT_EXCEEDED_TEMPLATE = ( + "Execution has downloaded {result_rows} rows so far, which exceeds the " + "limit of {maximum_result_rows}. You can adjust this limit by setting " + "`bpd.options.compute.maximum_result_rows`." +) + @dataclasses.dataclass(frozen=True) class ExecuteResult: - arrow_batches: Iterator[pyarrow.RecordBatch] + _arrow_batches: Iterator[pyarrow.RecordBatch] schema: bigframes.core.schema.ArraySchema query_job: Optional[bigquery.QueryJob] = None total_bytes: Optional[int] = None total_rows: Optional[int] = None + @property + def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]: + result_rows = 0 + + for batch in self._arrow_batches: + result_rows += batch.num_rows + + maximum_result_rows = bigframes.options.compute.maximum_result_rows + if maximum_result_rows is not None and result_rows > maximum_result_rows: + message = bigframes.exceptions.format_message( + _ROW_LIMIT_EXCEEDED_TEMPLATE.format( + result_rows=result_rows, + maximum_result_rows=maximum_result_rows, + ) + ) + raise bigframes.exceptions.MaximumResultRowsExceeded(message) + + yield batch + def to_arrow_table(self) -> pyarrow.Table: # Need to provide schema if no result rows, as arrow can't infer # If ther are rows, it is safest to infer schema from batches. 
diff --git a/bigframes/session/local_scan_executor.py b/bigframes/session/local_scan_executor.py index b4d7b226e2..65f088e8a1 100644 --- a/bigframes/session/local_scan_executor.py +++ b/bigframes/session/local_scan_executor.py @@ -58,7 +58,7 @@ def execute( total_rows = min(peek, total_rows) return executor.ExecuteResult( - arrow_batches=arrow_table.to_batches(), + _arrow_batches=arrow_table.to_batches(), schema=plan.schema, query_job=None, total_bytes=None, diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py index e215866874..6e3e15499d 100644 --- a/bigframes/session/polars_executor.py +++ b/bigframes/session/polars_executor.py @@ -13,26 +13,46 @@ # limitations under the License. from __future__ import annotations +import itertools from typing import Optional, TYPE_CHECKING import pyarrow as pa -from bigframes.core import array_value, bigframe_node, local_data, nodes +from bigframes.core import array_value, bigframe_node, expression, local_data, nodes +import bigframes.operations from bigframes.session import executor, semi_executor if TYPE_CHECKING: import polars as pl - +# Polars executor can execute more node types, but these are the validated ones _COMPATIBLE_NODES = ( nodes.ReadLocalNode, nodes.OrderByNode, nodes.ReversedNode, nodes.SelectionNode, - nodes.FilterNode, # partial support - nodes.ProjectionNode, # partial support ) +_COMPATIBLE_SCALAR_OPS = () + + +def _get_expr_ops(expr: expression.Expression) -> set[bigframes.operations.ScalarOp]: + if isinstance(expr, expression.OpExpression): + return set(itertools.chain.from_iterable(map(_get_expr_ops, expr.children))) + return set() + + +def _is_node_polars_executable(node: nodes.BigFrameNode): + if not isinstance(node, _COMPATIBLE_NODES): + return False + for expr in node._node_expressions: + if isinstance(expr, expression.Aggregation): + return False + if isinstance(expr, expression.Expression): + if not _get_expr_ops(expr).issubset(_COMPATIBLE_SCALAR_OPS): + return 
False + return True + class PolarsExecutor(semi_executor.SemiExecutor): def __init__(self): @@ -60,14 +80,14 @@ def execute( lazy_frame = lazy_frame.limit(peek) pa_table = lazy_frame.collect().to_arrow() return executor.ExecuteResult( - arrow_batches=iter(map(self._adapt_batch, pa_table.to_batches())), + _arrow_batches=iter(map(self._adapt_batch, pa_table.to_batches())), schema=plan.schema, total_bytes=pa_table.nbytes, total_rows=pa_table.num_rows, ) def _can_execute(self, plan: bigframe_node.BigFrameNode): - return all(isinstance(node, _COMPATIBLE_NODES) for node in plan.unique_nodes()) + return all(_is_node_polars_executable(node) for node in plan.unique_nodes()) def _adapt_array(self, array: pa.Array) -> pa.Array: target_type = local_data.logical_type_replacements(array.type) diff --git a/bigframes/session/read_api_execution.py b/bigframes/session/read_api_execution.py index d4bbf2783c..d5bcf1dbc7 100644 --- a/bigframes/session/read_api_execution.py +++ b/bigframes/session/read_api_execution.py @@ -111,7 +111,7 @@ def process_page(page): rows = min(peek, rows) return executor.ExecuteResult( - arrow_batches=batches, + _arrow_batches=batches, schema=plan.schema, query_job=None, total_bytes=None, diff --git a/bigframes/testing/mocks.py b/bigframes/testing/mocks.py index 25f1f90fe7..8d9997b1df 100644 --- a/bigframes/testing/mocks.py +++ b/bigframes/testing/mocks.py @@ -41,6 +41,7 @@ def create_bigquery_session( bqclient: Optional[mock.Mock] = None, session_id: str = "abcxyz", table_schema: Sequence[google.cloud.bigquery.SchemaField] = TEST_SCHEMA, + table_name: str = "test_table", anonymous_dataset: Optional[google.cloud.bigquery.DatasetReference] = None, location: str = "test-region", ordering_mode: Literal["strict", "partial"] = "partial", @@ -76,7 +77,7 @@ def create_bigquery_session( type(table).schema = mock.PropertyMock(return_value=table_schema) type(table).project = anonymous_dataset.project type(table).dataset_id = anonymous_dataset.dataset_id - 
type(table).table_id = "test_table" + type(table).table_id = table_name type(table).num_rows = mock.PropertyMock(return_value=1000000000) bqclient.get_table.return_value = table @@ -94,7 +95,7 @@ def query_mock( query_job = mock.create_autospec(google.cloud.bigquery.QueryJob, instance=True) query_job._properties = {} type(query_job).destination = mock.PropertyMock( - return_value=anonymous_dataset.table("test_table"), + return_value=anonymous_dataset.table(table_name), ) type(query_job).statement_type = mock.PropertyMock(return_value="SELECT") diff --git a/bigframes/testing/polars_session.py b/bigframes/testing/polars_session.py index 723841a672..7b898a9f00 100644 --- a/bigframes/testing/polars_session.py +++ b/bigframes/testing/polars_session.py @@ -46,7 +46,7 @@ def peek( # Currently, pyarrow types might not quite be exactly the ones in the bigframes schema. # Nullability may be different, and might use large versions of list, string datatypes. return bigframes.session.executor.ExecuteResult( - arrow_batches=pa_table.to_batches(), + _arrow_batches=pa_table.to_batches(), schema=array_value.schema, total_bytes=pa_table.nbytes, total_rows=pa_table.num_rows, @@ -69,7 +69,7 @@ def execute( # Currently, pyarrow types might not quite be exactly the ones in the bigframes schema. # Nullability may be different, and might use large versions of list, string datatypes. return bigframes.session.executor.ExecuteResult( - arrow_batches=pa_table.to_batches(), + _arrow_batches=pa_table.to_batches(), schema=array_value.schema, total_bytes=pa_table.nbytes, total_rows=pa_table.num_rows, diff --git a/bigframes/version.py b/bigframes/version.py index 138c380d0c..5d2de2f97f 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.7.0" +__version__ = "2.8.0" # {x-release-please-start-date} -__release_date__ = "2025-06-16" +__release_date__ = "2025-06-23" # {x-release-please-end} diff --git a/mypy.ini b/mypy.ini index fe1d3bc9c6..7709eb200a 100644 --- a/mypy.ini +++ b/mypy.ini @@ -41,3 +41,6 @@ ignore_missing_imports = True [mypy-google.cloud.bigtable] ignore_missing_imports = True + +[mypy-anywidget] +ignore_missing_imports = True diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb new file mode 100644 index 0000000000..c54f52da59 --- /dev/null +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -0,0 +1,149 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "d10bfca4", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "markdown", + "id": "acca43ae", + "metadata": {}, + "source": [ + "# Demo to Show Anywidget mode" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ca22f059", + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "markdown", + "id": "04406a4d", + "metadata": {}, + "source": [ + "Set the display option to use anywidget" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1bc5aaf3", + "metadata": {}, + "outputs": [], + "source": [ + "bpd.options.display.repr_mode = \"anywidget\"" + ] + }, + { + "cell_type": "markdown", + "id": "0a354c69", + "metadata": {}, + "source": [ + "Display the dataframe in anywidget mode" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f289d250", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 91997f19-1768-4360-afa7-4a431b3e2d22 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computation deferred. Computation will process 171.4 MB\n" + ] + } + ], + "source": [ + "df = bpd.read_gbq(\"bigquery-public-data.usa_names.usa_1910_2013\")\n", + "print(df)" + ] + }, + { + "cell_type": "markdown", + "id": "3a73e472", + "metadata": {}, + "source": [ + "Display Series in anywidget mode" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "42bb02ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computation deferred. 
Computation will process 171.4 MB\n" + ] + } + ], + "source": [ + "test_series = df[\"year\"]\n", + "print(test_series)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb b/notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb new file mode 100644 index 0000000000..05e75b37f0 --- /dev/null +++ b/notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb @@ -0,0 +1,1653 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BigFrames AI Forecast\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"BQ\n", + " Open in BQ Studio\n", + " \n", + "
\n", + " ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook introduces forecasting with a GenAI foundation model in BigFrames AI." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "PROJECT = \"bigframes-dev\" # replace with your project\n", + "\n", + "import bigframes\n", + "# Setup project\n", + "bigframes.options.bigquery.project = PROJECT\n", + "bigframes.options.display.progress_bar = None\n", + "\n", + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Create a BigFrames DataFrame from BigQuery public data." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_idduration_secstart_datestart_station_namestart_station_idend_dateend_station_nameend_station_idbike_numberzip_code...c_subscription_typestart_station_latitudestart_station_longitudeend_station_latitudeend_station_longitudemember_birth_yearmember_genderbike_share_for_all_tripstart_station_geomend_station_geom
013045315972016-08-05 10:55:00+00:00San Francisco Caltrain 2 (330 Townsend)692016-08-05 11:05:00+00:00Powell Street BART3921495121...Subscriber<NA><NA><NA><NA><NA><NA><NA>NoneNone
11848704032014-02-14 14:50:00+00:00Howard at 2nd632014-02-14 14:56:00+00:00Commercial at Montgomery4534294122...Subscriber<NA><NA><NA><NA><NA><NA><NA>NoneNone
220170702115603836166952017-07-02 11:56:03+00:00Union Square (Powell St at Post St)3242017-07-02 16:34:19+00:00Union Square (Powell St at Post St)324836<NA>...<NA>37.7883-122.40853137.7883-122.408531<NA><NA><NA>POINT (-122.40853 37.7883)POINT (-122.40853 37.7883)
310668109532016-01-21 08:24:00+00:00Civic Center BART (7th at Market)722016-01-21 08:40:00+00:00Embarcadero at Sansome6021294103...Subscriber<NA><NA><NA><NA><NA><NA><NA>NoneNone
42204816792014-03-19 19:20:00+00:00San Francisco Caltrain 2 (330 Townsend)692014-03-19 19:31:00+00:00Civic Center BART (7th at Market)7247894107...Subscriber<NA><NA><NA><NA><NA><NA><NA>NoneNone
57384743582015-04-23 16:45:00+00:002nd at Folsom622015-04-23 16:51:00+00:00Steuart at Market7444394105...Subscriber<NA><NA><NA><NA><NA><NA><NA>NoneNone
62292642862014-03-27 17:56:00+00:00Embarcadero at Sansome602014-03-27 18:01:00+00:00Davis at Jackson4234294133...Subscriber<NA><NA><NA><NA><NA><NA><NA>NoneNone
735201036212014-07-06 13:55:00+00:00Embarcadero at Sansome602014-07-06 14:55:00+00:00Embarcadero at Sansome603904038...Customer<NA><NA><NA><NA><NA><NA><NA>NoneNone
81562554162014-01-16 18:06:00+00:00Embarcadero at Bryant542014-01-16 18:13:00+00:00San Francisco Caltrain (Townsend at 4th)7051094107...Subscriber<NA><NA><NA><NA><NA><NA><NA>NoneNone
9104019710542015-12-15 18:05:00+00:00Steuart at Market742015-12-15 18:22:00+00:00San Francisco Caltrain (Townsend at 4th)7070094111...Subscriber<NA><NA><NA><NA><NA><NA><NA>NoneNone
1011526935622016-04-07 08:18:00+00:00San Francisco Caltrain (Townsend at 4th)702016-04-07 08:27:00+00:00Steuart at Market7441994158...Subscriber<NA><NA><NA><NA><NA><NA><NA>NoneNone
112018041917351834018872018-04-19 17:35:18+00:00Montgomery St BART Station (Market St at 2nd St)212018-04-19 17:50:06+00:00Civic Center/UN Plaza BART Station (Market St ...443401<NA>...<NA>37.789625-122.40081137.781074-122.4117381979MaleNoPOINT (-122.40081 37.78963)POINT (-122.41174 37.78107)
122092839432014-03-11 09:01:00+00:00South Van Ness at Market662014-03-11 09:16:00+00:00Temporary Transbay Terminal (Howard at Beale)5553294105...Subscriber<NA><NA><NA><NA><NA><NA><NA>NoneNone
132017082814043125303892017-08-28 14:04:31+00:0016th St at Prosper St1052017-08-28 14:11:00+00:00Mission Playground1212530<NA>...<NA>37.764285-122.43180437.75921-122.4213391981Male<NA>POINT (-122.4318 37.76428)POINT (-122.42134 37.75921)
14201711241151588413842017-11-24 11:51:58+00:002nd Ave at E 18th St2002017-11-24 11:58:23+00:00El Embarcadero at Grand Ave197841<NA>...<NA>37.800214-122.2538137.808848-122.249681977Female<NA>POINT (-122.25381 37.80021)POINT (-122.24968 37.80885)
1513210428742016-08-18 08:14:00+00:00San Francisco Caltrain (Townsend at 4th)702016-08-18 08:29:00+00:00Beale at Market5639095050...Subscriber<NA><NA><NA><NA><NA><NA><NA>NoneNone
1620171213132518312013762017-12-13 13:25:18+00:00Steuart St at Market St162017-12-13 13:48:14+00:00The Embarcadero at Sansome St63120<NA>...<NA>37.79413-122.3944337.80477-122.403234<NA><NA><NA>POINT (-122.39443 37.79413)POINT (-122.40323 37.80477)
172017083108271516462002017-08-31 08:27:15+00:00Powell St BART Station (Market St at 4th St)32017-08-31 08:30:36+00:00Montgomery St BART Station (Market St at 2nd St)211646<NA>...<NA>37.786375-122.40490437.789625-122.4008111988Male<NA>POINT (-122.4049 37.78638)POINT (-122.40081 37.78963)
1820180125175410290714902018-01-25 17:54:10+00:00Esprit Park1262018-01-25 18:19:01+00:00The Embarcadero at Vallejo St82907<NA>...<NA>37.761634-122.39064837.799953-122.3985251989FemaleNoPOINT (-122.39065 37.76163)POINT (-122.39852 37.79995)
192017092309513022223192017-09-23 09:51:30+00:007th St at Brannan St792017-09-23 09:56:49+00:00San Francisco Caltrain (Townsend St at 4th St)302222<NA>...<NA>37.773492-122.40367237.776598-122.3952821975Male<NA>POINT (-122.40367 37.77349)POINT (-122.39528 37.7766)
202018022017281541540092018-02-20 17:28:15+00:00Franklin St at 9th St1622018-02-20 18:35:05+00:00Telegraph Ave at 27th St179415<NA>...<NA>37.800516-122.2720837.816073-122.2678861973MaleYesPOINT (-122.27208 37.80052)POINT (-122.26789 37.81607)
212017101917144430036912017-10-19 17:14:44+00:00Harrison St at 20th St1292017-10-19 17:26:16+00:00Valencia St at 22nd St1333003<NA>...<NA>37.758862-122.41254437.755213-122.4209751958Male<NA>POINT (-122.41254 37.75886)POINT (-122.42098 37.75521)
225951464532015-01-07 18:34:00+00:00Market at 10th672015-01-07 18:42:00+00:00Townsend at 7th6542195014...Subscriber<NA><NA><NA><NA><NA><NA><NA>NoneNone
232017082909135024547882017-08-29 09:13:50+00:00San Francisco Caltrain (Townsend St at 4th St)302017-08-29 09:26:58+00:00The Embarcadero at Vallejo St82454<NA>...<NA>37.776598-122.39528237.799953-122.3985251979Male<NA>POINT (-122.39528 37.7766)POINT (-122.39852 37.79995)
242017122711504330361502017-12-27 11:50:43+00:00Powell St BART Station (Market St at 4th St)32017-12-27 11:53:14+00:004th St at Harrison St473036<NA>...<NA>37.786375-122.40490437.780955-122.3997491989Male<NA>POINT (-122.4049 37.78638)POINT (-122.39975 37.78095)
\n", + "

25 rows × 21 columns

\n", + "
[1947417 rows x 21 columns in total]" + ], + "text/plain": [ + " trip_id duration_sec start_date \\\n", + "0 1304531 597 2016-08-05 10:55:00+00:00 \n", + "1 184870 403 2014-02-14 14:50:00+00:00 \n", + "2 20170702115603836 16695 2017-07-02 11:56:03+00:00 \n", + "3 1066810 953 2016-01-21 08:24:00+00:00 \n", + "4 220481 679 2014-03-19 19:20:00+00:00 \n", + "5 738474 358 2015-04-23 16:45:00+00:00 \n", + "6 229264 286 2014-03-27 17:56:00+00:00 \n", + "7 352010 3621 2014-07-06 13:55:00+00:00 \n", + "8 156255 416 2014-01-16 18:06:00+00:00 \n", + "9 1040197 1054 2015-12-15 18:05:00+00:00 \n", + "10 1152693 562 2016-04-07 08:18:00+00:00 \n", + "11 201804191735183401 887 2018-04-19 17:35:18+00:00 \n", + "12 209283 943 2014-03-11 09:01:00+00:00 \n", + "13 201708281404312530 389 2017-08-28 14:04:31+00:00 \n", + "14 20171124115158841 384 2017-11-24 11:51:58+00:00 \n", + "15 1321042 874 2016-08-18 08:14:00+00:00 \n", + "16 201712131325183120 1376 2017-12-13 13:25:18+00:00 \n", + "17 201708310827151646 200 2017-08-31 08:27:15+00:00 \n", + "18 201801251754102907 1490 2018-01-25 17:54:10+00:00 \n", + "19 201709230951302222 319 2017-09-23 09:51:30+00:00 \n", + "20 20180220172815415 4009 2018-02-20 17:28:15+00:00 \n", + "21 201710191714443003 691 2017-10-19 17:14:44+00:00 \n", + "22 595146 453 2015-01-07 18:34:00+00:00 \n", + "23 201708290913502454 788 2017-08-29 09:13:50+00:00 \n", + "24 201712271150433036 150 2017-12-27 11:50:43+00:00 \n", + "\n", + " start_station_name start_station_id \\\n", + "0 San Francisco Caltrain 2 (330 Townsend) 69 \n", + "1 Howard at 2nd 63 \n", + "2 Union Square (Powell St at Post St) 324 \n", + "3 Civic Center BART (7th at Market) 72 \n", + "4 San Francisco Caltrain 2 (330 Townsend) 69 \n", + "5 2nd at Folsom 62 \n", + "6 Embarcadero at Sansome 60 \n", + "7 Embarcadero at Sansome 60 \n", + "8 Embarcadero at Bryant 54 \n", + "9 Steuart at Market 74 \n", + "10 San Francisco Caltrain (Townsend at 4th) 70 \n", + "11 Montgomery St BART Station (Market St at 
2nd St) 21 \n", + "12 South Van Ness at Market 66 \n", + "13 16th St at Prosper St 105 \n", + "14 2nd Ave at E 18th St 200 \n", + "15 San Francisco Caltrain (Townsend at 4th) 70 \n", + "16 Steuart St at Market St 16 \n", + "17 Powell St BART Station (Market St at 4th St) 3 \n", + "18 Esprit Park 126 \n", + "19 7th St at Brannan St 79 \n", + "20 Franklin St at 9th St 162 \n", + "21 Harrison St at 20th St 129 \n", + "22 Market at 10th 67 \n", + "23 San Francisco Caltrain (Townsend St at 4th St) 30 \n", + "24 Powell St BART Station (Market St at 4th St) 3 \n", + "\n", + " end_date \\\n", + "0 2016-08-05 11:05:00+00:00 \n", + "1 2014-02-14 14:56:00+00:00 \n", + "2 2017-07-02 16:34:19+00:00 \n", + "3 2016-01-21 08:40:00+00:00 \n", + "4 2014-03-19 19:31:00+00:00 \n", + "5 2015-04-23 16:51:00+00:00 \n", + "6 2014-03-27 18:01:00+00:00 \n", + "7 2014-07-06 14:55:00+00:00 \n", + "8 2014-01-16 18:13:00+00:00 \n", + "9 2015-12-15 18:22:00+00:00 \n", + "10 2016-04-07 08:27:00+00:00 \n", + "11 2018-04-19 17:50:06+00:00 \n", + "12 2014-03-11 09:16:00+00:00 \n", + "13 2017-08-28 14:11:00+00:00 \n", + "14 2017-11-24 11:58:23+00:00 \n", + "15 2016-08-18 08:29:00+00:00 \n", + "16 2017-12-13 13:48:14+00:00 \n", + "17 2017-08-31 08:30:36+00:00 \n", + "18 2018-01-25 18:19:01+00:00 \n", + "19 2017-09-23 09:56:49+00:00 \n", + "20 2018-02-20 18:35:05+00:00 \n", + "21 2017-10-19 17:26:16+00:00 \n", + "22 2015-01-07 18:42:00+00:00 \n", + "23 2017-08-29 09:26:58+00:00 \n", + "24 2017-12-27 11:53:14+00:00 \n", + "\n", + " end_station_name end_station_id \\\n", + "0 Powell Street BART 39 \n", + "1 Commercial at Montgomery 45 \n", + "2 Union Square (Powell St at Post St) 324 \n", + "3 Embarcadero at Sansome 60 \n", + "4 Civic Center BART (7th at Market) 72 \n", + "5 Steuart at Market 74 \n", + "6 Davis at Jackson 42 \n", + "7 Embarcadero at Sansome 60 \n", + "8 San Francisco Caltrain (Townsend at 4th) 70 \n", + "9 San Francisco Caltrain (Townsend at 4th) 70 \n", + "10 Steuart at Market 74 \n", + 
"11 Civic Center/UN Plaza BART Station (Market St ... 44 \n", + "12 Temporary Transbay Terminal (Howard at Beale) 55 \n", + "13 Mission Playground 121 \n", + "14 El Embarcadero at Grand Ave 197 \n", + "15 Beale at Market 56 \n", + "16 The Embarcadero at Sansome St 6 \n", + "17 Montgomery St BART Station (Market St at 2nd St) 21 \n", + "18 The Embarcadero at Vallejo St 8 \n", + "19 San Francisco Caltrain (Townsend St at 4th St) 30 \n", + "20 Telegraph Ave at 27th St 179 \n", + "21 Valencia St at 22nd St 133 \n", + "22 Townsend at 7th 65 \n", + "23 The Embarcadero at Vallejo St 8 \n", + "24 4th St at Harrison St 47 \n", + "\n", + " bike_number zip_code ... c_subscription_type start_station_latitude \\\n", + "0 214 95121 ... Subscriber \n", + "1 342 94122 ... Subscriber \n", + "2 836 ... 37.7883 \n", + "3 212 94103 ... Subscriber \n", + "4 478 94107 ... Subscriber \n", + "5 443 94105 ... Subscriber \n", + "6 342 94133 ... Subscriber \n", + "7 390 4038 ... Customer \n", + "8 510 94107 ... Subscriber \n", + "9 700 94111 ... Subscriber \n", + "10 419 94158 ... Subscriber \n", + "11 3401 ... 37.789625 \n", + "12 532 94105 ... Subscriber \n", + "13 2530 ... 37.764285 \n", + "14 841 ... 37.800214 \n", + "15 390 95050 ... Subscriber \n", + "16 3120 ... 37.79413 \n", + "17 1646 ... 37.786375 \n", + "18 2907 ... 37.761634 \n", + "19 2222 ... 37.773492 \n", + "20 415 ... 37.800516 \n", + "21 3003 ... 37.758862 \n", + "22 421 95014 ... Subscriber \n", + "23 2454 ... 37.776598 \n", + "24 3036 ... 
37.786375 \n", + "\n", + " start_station_longitude end_station_latitude end_station_longitude \\\n", + "0 \n", + "1 \n", + "2 -122.408531 37.7883 -122.408531 \n", + "3 \n", + "4 \n", + "5 \n", + "6 \n", + "7 \n", + "8 \n", + "9 \n", + "10 \n", + "11 -122.400811 37.781074 -122.411738 \n", + "12 \n", + "13 -122.431804 37.75921 -122.421339 \n", + "14 -122.25381 37.808848 -122.24968 \n", + "15 \n", + "16 -122.39443 37.80477 -122.403234 \n", + "17 -122.404904 37.789625 -122.400811 \n", + "18 -122.390648 37.799953 -122.398525 \n", + "19 -122.403672 37.776598 -122.395282 \n", + "20 -122.27208 37.816073 -122.267886 \n", + "21 -122.412544 37.755213 -122.420975 \n", + "22 \n", + "23 -122.395282 37.799953 -122.398525 \n", + "24 -122.404904 37.780955 -122.399749 \n", + "\n", + " member_birth_year member_gender bike_share_for_all_trip \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "5 \n", + "6 \n", + "7 \n", + "8 \n", + "9 \n", + "10 \n", + "11 1979 Male No \n", + "12 \n", + "13 1981 Male \n", + "14 1977 Female \n", + "15 \n", + "16 \n", + "17 1988 Male \n", + "18 1989 Female No \n", + "19 1975 Male \n", + "20 1973 Male Yes \n", + "21 1958 Male \n", + "22 \n", + "23 1979 Male \n", + "24 1989 Male \n", + "\n", + " start_station_geom end_station_geom \n", + "0 None None \n", + "1 None None \n", + "2 POINT (-122.40853 37.7883) POINT (-122.40853 37.7883) \n", + "3 None None \n", + "4 None None \n", + "5 None None \n", + "6 None None \n", + "7 None None \n", + "8 None None \n", + "9 None None \n", + "10 None None \n", + "11 POINT (-122.40081 37.78963) POINT (-122.41174 37.78107) \n", + "12 None None \n", + "13 POINT (-122.4318 37.76428) POINT (-122.42134 37.75921) \n", + "14 POINT (-122.25381 37.80021) POINT (-122.24968 37.80885) \n", + "15 None None \n", + "16 POINT (-122.39443 37.79413) POINT (-122.40323 37.80477) \n", + "17 POINT (-122.4049 37.78638) POINT (-122.40081 37.78963) \n", + "18 POINT (-122.39065 37.76163) POINT (-122.39852 37.79995) \n", + "19 POINT 
(-122.40367 37.77349) POINT (-122.39528 37.7766) \n", + "20 POINT (-122.27208 37.80052) POINT (-122.26789 37.81607) \n", + "21 POINT (-122.41254 37.75886) POINT (-122.42098 37.75521) \n", + "22 None None \n", + "23 POINT (-122.39528 37.7766) POINT (-122.39852 37.79995) \n", + "24 POINT (-122.4049 37.78638) POINT (-122.39975 37.78095) \n", + "...\n", + "\n", + "[1947417 rows x 21 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bpd.read_gbq(\"bigquery-public-data.san_francisco_bikeshare.bikeshare_trips\")\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Preprocess Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Only take the start_date after 2018 and the \"Subscriber\" category as input. start_date are truncated to each hour." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df = df[df[\"start_date\"] >= \"2018-01-01\"]\n", + "df = df[df[\"subscriber_type\"] == \"Subscriber\"]\n", + "df[\"trip_hour\"] = df[\"start_date\"].dt.floor(\"h\")\n", + "df = df[[\"trip_hour\", \"trip_id\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Group and count each hour's num of trips." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_hournum_trips
02018-01-01 00:00:00+00:0020
12018-01-01 01:00:00+00:0025
22018-01-01 02:00:00+00:0013
32018-01-01 03:00:00+00:0011
42018-01-01 05:00:00+00:004
52018-01-01 06:00:00+00:008
62018-01-01 07:00:00+00:008
72018-01-01 08:00:00+00:0020
82018-01-01 09:00:00+00:0030
92018-01-01 10:00:00+00:0041
102018-01-01 11:00:00+00:0045
112018-01-01 12:00:00+00:0054
122018-01-01 13:00:00+00:0057
132018-01-01 14:00:00+00:0068
142018-01-01 15:00:00+00:0086
152018-01-01 16:00:00+00:0072
162018-01-01 17:00:00+00:0072
172018-01-01 18:00:00+00:0047
182018-01-01 19:00:00+00:0032
192018-01-01 20:00:00+00:0034
202018-01-01 21:00:00+00:0027
212018-01-01 22:00:00+00:0015
222018-01-01 23:00:00+00:006
232018-01-02 00:00:00+00:002
242018-01-02 01:00:00+00:001
\n", + "

25 rows × 2 columns

\n", + "
[2842 rows x 2 columns in total]" + ], + "text/plain": [ + " trip_hour num_trips\n", + "0 2018-01-01 00:00:00+00:00 20\n", + "1 2018-01-01 01:00:00+00:00 25\n", + "2 2018-01-01 02:00:00+00:00 13\n", + "3 2018-01-01 03:00:00+00:00 11\n", + "4 2018-01-01 05:00:00+00:00 4\n", + "5 2018-01-01 06:00:00+00:00 8\n", + "6 2018-01-01 07:00:00+00:00 8\n", + "7 2018-01-01 08:00:00+00:00 20\n", + "8 2018-01-01 09:00:00+00:00 30\n", + "9 2018-01-01 10:00:00+00:00 41\n", + "10 2018-01-01 11:00:00+00:00 45\n", + "11 2018-01-01 12:00:00+00:00 54\n", + "12 2018-01-01 13:00:00+00:00 57\n", + "13 2018-01-01 14:00:00+00:00 68\n", + "14 2018-01-01 15:00:00+00:00 86\n", + "15 2018-01-01 16:00:00+00:00 72\n", + "16 2018-01-01 17:00:00+00:00 72\n", + "17 2018-01-01 18:00:00+00:00 47\n", + "18 2018-01-01 19:00:00+00:00 32\n", + "19 2018-01-01 20:00:00+00:00 34\n", + "20 2018-01-01 21:00:00+00:00 27\n", + "21 2018-01-01 22:00:00+00:00 15\n", + "22 2018-01-01 23:00:00+00:00 6\n", + "23 2018-01-02 00:00:00+00:00 2\n", + "24 2018-01-02 01:00:00+00:00 1\n", + "...\n", + "\n", + "[2842 rows x 2 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_grouped = df.groupby(\"trip_hour\").count()\n", + "df_grouped = df_grouped.reset_index().rename(columns={\"trip_id\": \"num_trips\"})\n", + "df_grouped" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Make forecastings for next 1 week with DataFrames.ai.forecast API" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
forecast_timestampforecast_valueconfidence_levelprediction_interval_lower_boundprediction_interval_upper_boundai_forecast_status
02018-05-05 01:00:00+00:0050.1236720.95-13.062586113.309931
12018-05-05 07:00:00+00:00103.1128460.9533.725954172.499739
22018-05-03 15:00:00+00:00230.491470.95152.635986308.346954
32018-05-02 08:00:00+00:00737.4773560.95562.979208911.975504
42018-05-01 08:00:00+00:00679.9804690.95479.980134879.980803
52018-05-06 18:00:00+00:00136.808350.95-13.813863287.430562
62018-05-01 11:00:00+00:00120.3642880.9552.778249187.950328
72018-05-06 22:00:00+00:0064.7224430.95-55.555842185.000727
82018-05-03 02:00:00+00:0042.6898040.9533.25841452.121194
92018-05-07 17:00:00+00:00594.9990840.95346.917217843.080952
102018-05-03 20:00:00+00:00161.8222810.95100.005942223.63862
112018-05-01 20:00:00+00:00173.8010250.9556.460376291.141675
122018-05-04 17:00:00+00:00485.4498290.95356.038539614.86112
132018-05-04 09:00:00+00:00418.0558780.95281.134736554.977019
142018-05-07 03:00:00+00:0024.7351340.95-100.607727150.077995
152018-05-05 11:00:00+00:00186.081360.95140.706789231.455931
162018-05-03 08:00:00+00:00675.3802490.95532.913707817.846791
172018-05-02 09:00:00+00:00537.4948120.95376.406922698.582702
182018-05-01 12:00:00+00:00101.6371690.9555.141509148.132829
192018-05-05 00:00:00+00:007.4697720.95-23.93039238.869936
202018-05-02 14:00:00+00:00153.8513790.95104.224826203.477932
212018-05-04 13:00:00+00:00162.6761170.95113.098327212.253907
222018-05-04 16:00:00+00:00330.6434020.95205.125168456.161636
232018-05-04 21:00:00+00:00136.2646790.9541.947438230.58192
242018-05-02 17:00:00+00:00675.5272220.95516.358698834.695746
\n", + "

25 rows × 6 columns

\n", + "
[168 rows x 6 columns in total]" + ], + "text/plain": [ + " forecast_timestamp forecast_value confidence_level \\\n", + "0 2018-05-05 01:00:00+00:00 50.123672 0.95 \n", + "1 2018-05-05 07:00:00+00:00 103.112846 0.95 \n", + "2 2018-05-03 15:00:00+00:00 230.49147 0.95 \n", + "3 2018-05-02 08:00:00+00:00 737.477356 0.95 \n", + "4 2018-05-01 08:00:00+00:00 679.980469 0.95 \n", + "5 2018-05-06 18:00:00+00:00 136.80835 0.95 \n", + "6 2018-05-01 11:00:00+00:00 120.364288 0.95 \n", + "7 2018-05-06 22:00:00+00:00 64.722443 0.95 \n", + "8 2018-05-03 02:00:00+00:00 42.689804 0.95 \n", + "9 2018-05-07 17:00:00+00:00 594.999084 0.95 \n", + "10 2018-05-03 20:00:00+00:00 161.822281 0.95 \n", + "11 2018-05-01 20:00:00+00:00 173.801025 0.95 \n", + "12 2018-05-04 17:00:00+00:00 485.449829 0.95 \n", + "13 2018-05-04 09:00:00+00:00 418.055878 0.95 \n", + "14 2018-05-07 03:00:00+00:00 24.735134 0.95 \n", + "15 2018-05-05 11:00:00+00:00 186.08136 0.95 \n", + "16 2018-05-03 08:00:00+00:00 675.380249 0.95 \n", + "17 2018-05-02 09:00:00+00:00 537.494812 0.95 \n", + "18 2018-05-01 12:00:00+00:00 101.637169 0.95 \n", + "19 2018-05-05 00:00:00+00:00 7.469772 0.95 \n", + "20 2018-05-02 14:00:00+00:00 153.851379 0.95 \n", + "21 2018-05-04 13:00:00+00:00 162.676117 0.95 \n", + "22 2018-05-04 16:00:00+00:00 330.643402 0.95 \n", + "23 2018-05-04 21:00:00+00:00 136.264679 0.95 \n", + "24 2018-05-02 17:00:00+00:00 675.527222 0.95 \n", + "\n", + " prediction_interval_lower_bound prediction_interval_upper_bound \\\n", + "0 -13.062586 113.309931 \n", + "1 33.725954 172.499739 \n", + "2 152.635986 308.346954 \n", + "3 562.979208 911.975504 \n", + "4 479.980134 879.980803 \n", + "5 -13.813863 287.430562 \n", + "6 52.778249 187.950328 \n", + "7 -55.555842 185.000727 \n", + "8 33.258414 52.121194 \n", + "9 346.917217 843.080952 \n", + "10 100.005942 223.63862 \n", + "11 56.460376 291.141675 \n", + "12 356.038539 614.86112 \n", + "13 281.134736 554.977019 \n", + "14 -100.607727 150.077995 \n", + "15 
140.706789 231.455931 \n", + "16 532.913707 817.846791 \n", + "17 376.406922 698.582702 \n", + "18 55.141509 148.132829 \n", + "19 -23.930392 38.869936 \n", + "20 104.224826 203.477932 \n", + "21 113.098327 212.253907 \n", + "22 205.125168 456.161636 \n", + "23 41.947438 230.58192 \n", + "24 516.358698 834.695746 \n", + "\n", + " ai_forecast_status \n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "5 \n", + "6 \n", + "7 \n", + "8 \n", + "9 \n", + "10 \n", + "11 \n", + "12 \n", + "13 \n", + "14 \n", + "15 \n", + "16 \n", + "17 \n", + "18 \n", + "19 \n", + "20 \n", + "21 \n", + "22 \n", + "23 \n", + "24 \n", + "...\n", + "\n", + "[168 rows x 6 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = df_grouped.ai.forecast(timestamp_column=\"trip_hour\", data_column=\"num_trips\", horizon=168) # 1 week\n", + "result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Process the raw result and draw a line plot along with the training data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "result = result.sort_values(\"forecast_timestamp\")\n", + "result = result[[\"forecast_timestamp\", \"forecast_value\"]]\n", + "result = result.rename(columns={\"forecast_timestamp\": \"trip_hour\", \"forecast_value\": \"num_trips_forecast\"})\n", + "df_all = bpd.concat([df_grouped, result])\n", + "df_all = df_all.tail(672) # 4 weeks" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df_all = df_all.set_index(\"trip_hour\")\n", + "df_all.plot.line(figsize=(16, 8))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index 3f36c2908a..fbe074b0d0 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -254,16 +254,17 @@ "outputs": [], "source": [ "df_image[\"blurred\"] = df_image[\"image\"].blob.image_blur(\n", - " (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\"\n", + " (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\", engine=\"opencv\"\n", ")\n", "df_image[\"resized\"] = df_image[\"image\"].blob.image_resize(\n", - " (300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_resize_transformed/\"\n", + " (300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_resize_transformed/\", engine=\"opencv\"\n", ")\n", "df_image[\"normalized\"] = df_image[\"image\"].blob.image_normalize(\n", " alpha=50.0,\n", " beta=150.0,\n", " norm_type=\"minmax\",\n", " dst=f\"gs://{OUTPUT_BUCKET}/image_normalize_transformed/\",\n", + " engine=\"opencv\",\n", ")" ] }, @@ -280,7 +281,7 @@ "outputs": [], "source": [ "# You can also chain functions together\n", - "df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\")" + "df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), 
dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\", engine=\"opencv\")" ] }, { @@ -419,7 +420,7 @@ }, "outputs": [], "source": [ - "df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk()" + "df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\")" ] }, { diff --git a/noxfile.py b/noxfile.py index dee5f929b7..a1e8e5b99b 100644 --- a/noxfile.py +++ b/noxfile.py @@ -77,9 +77,9 @@ ] UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = [] UNIT_TEST_DEPENDENCIES: List[str] = [] -UNIT_TEST_EXTRAS: List[str] = ["tests"] +UNIT_TEST_EXTRAS: List[str] = ["tests", "anywidget"] UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { - "3.12": ["tests", "polars", "scikit-learn"], + "3.12": ["tests", "polars", "scikit-learn", "anywidget"], } # 3.10 is needed for Windows tests as it is the only version installed in the @@ -106,9 +106,9 @@ SYSTEM_TEST_DEPENDENCIES: List[str] = [] SYSTEM_TEST_EXTRAS: List[str] = [] SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { - "3.9": ["tests"], + "3.9": ["tests", "anywidget"], "3.10": ["tests"], - "3.12": ["tests", "scikit-learn", "polars"], + "3.12": ["tests", "scikit-learn", "polars", "anywidget"], "3.13": ["tests", "polars"], } @@ -276,6 +276,7 @@ def mypy(session): "types-setuptools", "types-tabulate", "polars", + "anywidget", ] ) | set(SYSTEM_TEST_STANDARD_DEPENDENCIES) @@ -518,6 +519,7 @@ def docs(session): SPHINX_VERSION, "alabaster", "recommonmark", + "anywidget", ) shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) @@ -560,6 +562,7 @@ def docfx(session): "alabaster", "recommonmark", "gcp-sphinx-docfx-yaml==3.0.1", + "anywidget", ) shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) @@ -763,6 +766,7 @@ def notebook(session: nox.Session): "google-cloud-aiplatform", "matplotlib", "seaborn", + "anywidget", ) notebooks_list = list(pathlib.Path("notebooks/").glob("*/*.ipynb")) @@ -805,6 +809,9 @@ def notebook(session: nox.Session): # continuously tested. 
"notebooks/apps/synthetic_data_generation.ipynb", "notebooks/multimodal/multimodal_dataframe.ipynb", # too slow + # This anywidget notebook uses deferred execution, so it won't + # produce metrics for the performance benchmark script. + "notebooks/dataframes/anywidget_mode.ipynb", ] # TODO: remove exception for Python 3.13 cloud run adds a runtime for it (internal issue 333742751) diff --git a/samples/snippets/multimodal_test.py b/samples/snippets/multimodal_test.py index 7f8e13cd7b..087299aa0a 100644 --- a/samples/snippets/multimodal_test.py +++ b/samples/snippets/multimodal_test.py @@ -56,21 +56,22 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None: # [START bigquery_dataframes_multimodal_dataframe_image_transform] df_image["blurred"] = df_image["image"].blob.image_blur( - (20, 20), dst=f"{dst_bucket}/image_blur_transformed/" + (20, 20), dst=f"{dst_bucket}/image_blur_transformed/", engine="opencv" ) df_image["resized"] = df_image["image"].blob.image_resize( - (300, 200), dst=f"{dst_bucket}/image_resize_transformed/" + (300, 200), dst=f"{dst_bucket}/image_resize_transformed/", engine="opencv" ) df_image["normalized"] = df_image["image"].blob.image_normalize( alpha=50.0, beta=150.0, norm_type="minmax", dst=f"{dst_bucket}/image_normalize_transformed/", + engine="opencv", ) # You can also chain functions together df_image["blur_resized"] = df_image["blurred"].blob.image_resize( - (300, 200), dst=f"{dst_bucket}/image_blur_resize_transformed/" + (300, 200), dst=f"{dst_bucket}/image_blur_resize_transformed/", engine="opencv" ) df_image # [END bigquery_dataframes_multimodal_dataframe_image_transform] @@ -113,7 +114,7 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None: df_pdf = bpd.from_glob_path( "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*", name="pdf" ) - df_pdf["chunked"] = df_pdf["pdf"].blob.pdf_chunk() + df_pdf["chunked"] = df_pdf["pdf"].blob.pdf_chunk(engine="pypdf") chunked = df_pdf["chunked"].explode() chunked # [END 
bigquery_dataframes_multimodal_dataframe_pdf_chunk] diff --git a/setup.py b/setup.py index ff40d29a16..ce0375527d 100644 --- a/setup.py +++ b/setup.py @@ -86,6 +86,10 @@ "nox", "google-cloud-testutils", ], + # install anywidget for SQL + "anywidget": [ + "anywidget>=0.9.18", + ], } extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values())))) diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index 4a95e4c6d1..a594b144f5 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -61,7 +61,9 @@ def test_blob_exif( connection=bq_connection, ) - actual = exif_image_df["blob_col"].blob.exif(connection=bq_connection) + actual = exif_image_df["blob_col"].blob.exif( + engine="pillow", connection=bq_connection + ) expected = bpd.Series( ['{"ExifOffset": 47, "Make": "MyCamera"}'], session=session, @@ -86,7 +88,7 @@ def test_blob_image_blur_to_series( ) actual = images_mm_df["blob_col"].blob.image_blur( - (8, 8), dst=series, connection=bq_connection + (8, 8), dst=series, connection=bq_connection, engine="opencv" ) expected_df = pd.DataFrame( { @@ -114,7 +116,7 @@ def test_blob_image_blur_to_folder( images_output_uris: list[str], ): actual = images_mm_df["blob_col"].blob.image_blur( - (8, 8), dst=images_output_folder, connection=bq_connection + (8, 8), dst=images_output_folder, connection=bq_connection, engine="opencv" ) expected_df = pd.DataFrame( { @@ -136,7 +138,9 @@ def test_blob_image_blur_to_folder( def test_blob_image_blur_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str): - actual = images_mm_df["blob_col"].blob.image_blur((8, 8), connection=bq_connection) + actual = images_mm_df["blob_col"].blob.image_blur( + (8, 8), connection=bq_connection, engine="opencv" + ) assert isinstance(actual, bpd.Series) assert len(actual) == 2 @@ -154,7 +158,7 @@ def test_blob_image_resize_to_series( ) actual = images_mm_df["blob_col"].blob.image_resize( - (200, 300), 
dst=series, connection=bq_connection + (200, 300), dst=series, connection=bq_connection, engine="opencv" ) expected_df = pd.DataFrame( { @@ -182,7 +186,7 @@ def test_blob_image_resize_to_folder( images_output_uris: list[str], ): actual = images_mm_df["blob_col"].blob.image_resize( - (200, 300), dst=images_output_folder, connection=bq_connection + (200, 300), dst=images_output_folder, connection=bq_connection, engine="opencv" ) expected_df = pd.DataFrame( { @@ -205,7 +209,7 @@ def test_blob_image_resize_to_folder( def test_blob_image_resize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str): actual = images_mm_df["blob_col"].blob.image_resize( - (200, 300), connection=bq_connection + (200, 300), connection=bq_connection, engine="opencv" ) assert isinstance(actual, bpd.Series) @@ -224,7 +228,12 @@ def test_blob_image_normalize_to_series( ) actual = images_mm_df["blob_col"].blob.image_normalize( - alpha=50.0, beta=150.0, norm_type="minmax", dst=series, connection=bq_connection + alpha=50.0, + beta=150.0, + norm_type="minmax", + dst=series, + connection=bq_connection, + engine="opencv", ) expected_df = pd.DataFrame( { @@ -257,6 +266,7 @@ def test_blob_image_normalize_to_folder( norm_type="minmax", dst=images_output_folder, connection=bq_connection, + engine="opencv", ) expected_df = pd.DataFrame( { @@ -279,7 +289,11 @@ def test_blob_image_normalize_to_folder( def test_blob_image_normalize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str): actual = images_mm_df["blob_col"].blob.image_normalize( - alpha=50.0, beta=150.0, norm_type="minmax", connection=bq_connection + alpha=50.0, + beta=150.0, + norm_type="minmax", + connection=bq_connection, + engine="opencv", ) assert isinstance(actual, bpd.Series) @@ -322,7 +336,7 @@ def test_blob_pdf_extract( ): actual = ( pdf_mm_df["pdf"] - .blob.pdf_extract(connection=bq_connection, verbose=verbose) + .blob.pdf_extract(connection=bq_connection, verbose=verbose, engine="pypdf") .explode() .to_pandas() ) @@ -373,7 +387,11 @@ 
def test_blob_pdf_chunk( actual = ( pdf_mm_df["pdf"] .blob.pdf_chunk( - connection=bq_connection, chunk_size=50, overlap_size=10, verbose=verbose + connection=bq_connection, + chunk_size=50, + overlap_size=10, + verbose=verbose, + engine="pypdf", ) .explode() .to_pandas() diff --git a/tests/system/large/operations/test_ai.py b/tests/system/large/operations/test_ai.py index afd135591f..ded5e0b588 100644 --- a/tests/system/large/operations/test_ai.py +++ b/tests/system/large/operations/test_ai.py @@ -27,17 +27,6 @@ THRESHOLD_OPTION = "compute.ai_ops_confirmation_threshold" -def test_ai_experiment_off_raise_error(): - df = dataframe.DataFrame( - {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} - ) - - with bigframes.option_context(AI_OP_EXP_OPTION, False), pytest.raises( - NotImplementedError - ): - df.ai - - def test_filter(session, gemini_flash_model): df = dataframe.DataFrame( data={ diff --git a/tests/system/small/engines/test_sorting.py b/tests/system/small/engines/test_sorting.py new file mode 100644 index 0000000000..d1929afa44 --- /dev/null +++ b/tests/system/small/engines/test_sorting.py @@ -0,0 +1,103 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from bigframes.core import array_value, nodes, ordering +import bigframes.operations as bf_ops +from bigframes.session import polars_executor +from bigframes.testing.engine_utils import assert_equivalence_execution + +pytest.importorskip("polars") + +# Polars used as reference as it's fast and local. Generally though, prefer gbq engine where they disagree. +REFERENCE_ENGINE = polars_executor.PolarsExecutor() + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_reverse( + scalars_array_value: array_value.ArrayValue, + engine, +): + node = apply_reverse(scalars_array_value.node) + assert_equivalence_execution(node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_double_reverse( + scalars_array_value: array_value.ArrayValue, + engine, +): + node = apply_reverse(apply_reverse(scalars_array_value.node)) + assert_equivalence_execution(node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize( + "sort_col", + [ + "bool_col", + "int64_col", + "bytes_col", + "date_col", + "datetime_col", + "int64_col", + "int64_too", + "numeric_col", + "float64_col", + "string_col", + "time_col", + "timestamp_col", + ], +) +def test_engines_sort_over_column( + scalars_array_value: array_value.ArrayValue, engine, sort_col +): + node = apply_reverse(scalars_array_value.node) + ORDER_EXPRESSIONS = (ordering.descending_over(sort_col, nulls_last=False),) + node = nodes.OrderByNode(node, ORDER_EXPRESSIONS) + assert_equivalence_execution(node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_sort_multi_column_refs( + scalars_array_value: array_value.ArrayValue, + engine, +): + node = scalars_array_value.node + ORDER_EXPRESSIONS = ( + ordering.ascending_over("bool_col", nulls_last=False), + ordering.descending_over("int64_col"), + ) + node =
nodes.OrderByNode(node, ORDER_EXPRESSIONS) + assert_equivalence_execution(node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars"], indirect=True) +def test_polars_engines_skips_unrecognized_order_expr( + scalars_array_value: array_value.ArrayValue, + engine, +): + node = scalars_array_value.node + ORDER_EXPRESSIONS = ( + ordering.OrderingExpression( + scalar_expression=bf_ops.sin_op.as_expr("float_col") + ), + ) + node = nodes.OrderByNode(node, ORDER_EXPRESSIONS) + assert engine.execute(node, ordered=True) is None + + +def apply_reverse(node: nodes.BigFrameNode) -> nodes.BigFrameNode: + return nodes.ReversedNode(node) diff --git a/tests/system/small/operations/test_ai.py b/tests/system/small/operations/test_ai.py index 83aca8b5b1..771b7b47d3 100644 --- a/tests/system/small/operations/test_ai.py +++ b/tests/system/small/operations/test_ai.py @@ -23,9 +23,19 @@ import bigframes from bigframes import dataframe, dtypes from bigframes.ml import llm +import bigframes.operations.ai +from bigframes.testing import utils AI_OP_EXP_OPTION = "experiments.ai_operators" THRESHOLD_OPTION = "compute.ai_ops_confirmation_threshold" +AI_FORECAST_COLUMNS = [ + "forecast_timestamp", + "forecast_value", + "confidence_level", + "prediction_interval_lower_bound", + "prediction_interval_upper_bound", + "ai_forecast_status", +] class FakeGeminiTextGenerator(llm.GeminiTextGenerator): @@ -36,7 +46,47 @@ def predict(self, *args, **kwargs): return self.prediction -def test_experiment_off_raise_error(session): +@pytest.mark.parametrize( + ("func", "kwargs"), + [ + pytest.param( + bigframes.operations.ai.AIAccessor.filter, + {"instruction": None, "model": None}, + id="filter", + ), + pytest.param( + bigframes.operations.ai.AIAccessor.map, + {"instruction": None, "model": None}, + id="map", + ), + pytest.param( + bigframes.operations.ai.AIAccessor.classify, + {"instruction": None, "model": None, "labels": None}, + id="classify", + ), + pytest.param( + 
bigframes.operations.ai.AIAccessor.join, + {"other": None, "instruction": None, "model": None}, + id="join", + ), + pytest.param( + bigframes.operations.ai.AIAccessor.search, + {"search_column": None, "query": None, "top_k": None, "model": None}, + id="search", + ), + pytest.param( + bigframes.operations.ai.AIAccessor.top_k, + {"instruction": None, "model": None}, + id="top_k", + ), + pytest.param( + bigframes.operations.ai.AIAccessor.sim_join, + {"other": None, "left_on": None, "right_on": None, "model": None}, + id="sim_join", + ), + ], +) +def test_experiment_off_raise_error(session, func, kwargs): df = dataframe.DataFrame( {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, session=session ) @@ -44,7 +94,7 @@ def test_experiment_off_raise_error(session): with bigframes.option_context(AI_OP_EXP_OPTION, False), pytest.raises( NotImplementedError ): - df.ai + func(df.ai, **kwargs) def test_filter(session): @@ -216,6 +266,34 @@ def test_top_k(session): assert len(result) == 1 +def test_forecast_default(time_series_df_default_index: dataframe.DataFrame): + df = time_series_df_default_index[time_series_df_default_index["id"] == "1"] + + result = df.ai.forecast(timestamp_column="parsed_date", data_column="total_visits") + + utils.check_pandas_df_schema_and_index( + result, + columns=AI_FORECAST_COLUMNS, + index=10, + ) + + +def test_forecast_w_params(time_series_df_default_index: dataframe.DataFrame): + result = time_series_df_default_index.ai.forecast( + timestamp_column="parsed_date", + data_column="total_visits", + id_columns=["id"], + horizon=20, + confidence_level=0.98, + ) + + utils.check_pandas_df_schema_and_index( + result, + columns=["id"] + AI_FORECAST_COLUMNS, + index=20 * 2, # 20 for each id + ) + + def _create_dummy_full_response(row_count: int) -> pd.Series: entry = """{"candidates": [{"avg_logprobs": -0.5}]}""" diff --git a/tests/system/small/pandas/core/methods/__init__.py b/tests/system/small/pandas/core/methods/__init__.py new file mode 
100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/system/small/pandas/core/methods/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/system/small/pandas/core/methods/test_describe.py b/tests/system/small/pandas/core/methods/test_describe.py new file mode 100644 index 0000000000..dfc7c3fb23 --- /dev/null +++ b/tests/system/small/pandas/core/methods/test_describe.py @@ -0,0 +1,226 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pandas.testing +import pytest + + +def test_df_describe_non_temporal(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + # excluding temporal columns here because BigFrames cannot perform percentiles operations on them + unsupported_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"] + bf_result = scalars_df.drop(columns=unsupported_columns).describe().to_pandas() + + modified_pd_df = scalars_pandas_df.drop(columns=unsupported_columns) + pd_result = modified_pd_df.describe() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + # Drop quartiles, as they are approximate + bf_min = bf_result.loc["min", :] + bf_p25 = bf_result.loc["25%", :] + bf_p50 = bf_result.loc["50%", :] + bf_p75 = bf_result.loc["75%", :] + bf_max = bf_result.loc["max", :] + + bf_result = bf_result.drop(labels=["25%", "50%", "75%"]) + pd_result = pd_result.drop(labels=["25%", "50%", "75%"]) + + pandas.testing.assert_frame_equal(pd_result, bf_result, check_index_type=False) + + # Double-check that quantiles are at least plausible. 
+ assert ( + (bf_min <= bf_p25) + & (bf_p25 <= bf_p50) + & (bf_p50 <= bf_p75) + & (bf_p75 <= bf_max) + ).all() + + +@pytest.mark.parametrize("include", [None, "all"]) +def test_df_describe_non_numeric(scalars_dfs, include): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + # Excluding "date_col" here because in BigFrames it is used as PyArrow[date32()], which is + # considered numerical in Pandas + target_columns = ["string_col", "bytes_col", "bool_col", "time_col"] + + modified_bf = scalars_df[target_columns] + bf_result = modified_bf.describe(include=include).to_pandas() + + modified_pd_df = scalars_pandas_df[target_columns] + pd_result = modified_pd_df.describe(include=include) + + # Reindex results with the specified keys and their order, because + # the relative order is not important. + bf_result = bf_result.reindex(["count", "nunique"]) + pd_result = pd_result.reindex( + ["count", "unique"] + # BF counter part of "unique" is called "nunique" + ).rename(index={"unique": "nunique"}) + + pandas.testing.assert_frame_equal( + pd_result.astype("Int64"), + bf_result, + check_index_type=False, + ) + + +def test_df_describe_temporal(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + temporal_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"] + + modified_bf = scalars_df[temporal_columns] + bf_result = modified_bf.describe(include="all").to_pandas() + + modified_pd_df = scalars_pandas_df[temporal_columns] + pd_result = modified_pd_df.describe(include="all") + + # Reindex results with the specified keys and their order, because + # the relative order is not important.
+ bf_result = bf_result.reindex(["count", "nunique"]) + pd_result = pd_result.reindex( + ["count", "unique"] + # BF counter part of "unique" is called "nunique" + ).rename(index={"unique": "nunique"}) + + pandas.testing.assert_frame_equal( + pd_result.astype("Float64"), + bf_result.astype("Float64"), + check_index_type=False, + ) + + +def test_df_describe_mixed_types_include_all(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + numeric_columns = [ + "int64_col", + "float64_col", + ] + non_numeric_columns = ["string_col"] + supported_columns = numeric_columns + non_numeric_columns + + modified_bf = scalars_df[supported_columns] + bf_result = modified_bf.describe(include="all").to_pandas() + + modified_pd_df = scalars_pandas_df[supported_columns] + pd_result = modified_pd_df.describe(include="all") + + # Drop quartiles, as they are approximate + bf_min = bf_result.loc["min", :] + bf_p25 = bf_result.loc["25%", :] + bf_p50 = bf_result.loc["50%", :] + bf_p75 = bf_result.loc["75%", :] + bf_max = bf_result.loc["max", :] + + # Reindex results with the specified keys and their order, because + # the relative order is not important. + bf_result = bf_result.reindex(["count", "nunique", "mean", "std", "min", "max"]) + pd_result = pd_result.reindex( + ["count", "unique", "mean", "std", "min", "max"] + # BF counter part of "unique" is called "nunique" + ).rename(index={"unique": "nunique"}) + + pandas.testing.assert_frame_equal( + pd_result[numeric_columns].astype("Float64"), + bf_result[numeric_columns], + check_index_type=False, + ) + + pandas.testing.assert_frame_equal( + pd_result[non_numeric_columns].astype("Int64"), + bf_result[non_numeric_columns], + check_index_type=False, + ) + + # Double-check that quantiles are at least plausible. 
+ assert ( + (bf_min <= bf_p25) + & (bf_p25 <= bf_p50) + & (bf_p50 <= bf_p75) + & (bf_p75 <= bf_max) + ).all() + + +def test_series_describe_numeric(scalars_dfs): + target_col = "int64_col" + bf_df, pd_df = scalars_dfs + bf_s, pd_s = bf_df[target_col], pd_df[target_col] + + bf_result = ( + bf_s.describe() + .to_pandas() + .reindex(["count", "nunique", "mean", "std", "min", "max"]) + ) + pd_result = ( + pd_s.describe() + .reindex(["count", "unique", "mean", "std", "min", "max"]) + .rename(index={"unique": "nunique"}) + ) + + pandas.testing.assert_series_equal( + bf_result, + pd_result, + check_dtype=False, + check_index_type=False, + ) + + +def test_series_describe_non_numeric(scalars_dfs): + target_col = "string_col" + bf_df, pd_df = scalars_dfs + bf_s, pd_s = bf_df[target_col], pd_df[target_col] + + bf_result = bf_s.describe().to_pandas().reindex(["count", "nunique"]) + pd_result = ( + pd_s.describe().reindex(["count", "unique"]).rename(index={"unique": "nunique"}) + ) + + pandas.testing.assert_series_equal( + bf_result, + pd_result, + check_dtype=False, + check_index_type=False, + ) + + +def test_series_describe_temporal(scalars_dfs): + # Pandas returns for unique timestamps only after 2.1.0 + pytest.importorskip("pandas", minversion="2.1.0") + target_col = "timestamp_col" + bf_df, pd_df = scalars_dfs + bf_s, pd_s = bf_df[target_col], pd_df[target_col] + + bf_result = bf_s.describe().to_pandas().reindex(["count", "nunique"]) + pd_result = ( + pd_s.describe().reindex(["count", "unique"]).rename(index={"unique": "nunique"}) + ) + + pandas.testing.assert_series_equal( + bf_result, + pd_result, + check_dtype=False, + check_index_type=False, + ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 946df79cbf..b037c6f371 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -810,7 +810,7 @@ def test_repr_html_w_all_rows(scalars_dfs, session): + f"[{len(pandas_df.index)} rows x
{len(pandas_df.columns)} columns in total]" ) assert actual == expected - assert (executions_post - executions_pre) <= 2 + assert (executions_post - executions_pre) <= 3 def test_df_column_name_with_space(scalars_dfs): @@ -3121,154 +3121,6 @@ def test_dataframe_agg_int_multi_string(scalars_dfs): ) -def test_df_describe_non_temporal(scalars_dfs): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - # excluding temporal columns here because BigFrames cannot perform percentiles operations on them - unsupported_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"] - bf_result = scalars_df.drop(columns=unsupported_columns).describe().to_pandas() - - modified_pd_df = scalars_pandas_df.drop(columns=unsupported_columns) - pd_result = modified_pd_df.describe() - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - - # Drop quartiles, as they are approximate - bf_min = bf_result.loc["min", :] - bf_p25 = bf_result.loc["25%", :] - bf_p50 = bf_result.loc["50%", :] - bf_p75 = bf_result.loc["75%", :] - bf_max = bf_result.loc["max", :] - - bf_result = bf_result.drop(labels=["25%", "50%", "75%"]) - pd_result = pd_result.drop(labels=["25%", "50%", "75%"]) - - pd.testing.assert_frame_equal(pd_result, bf_result, check_index_type=False) - - # Double-check that quantiles are at least plausible. 
- assert ( - (bf_min <= bf_p25) - & (bf_p25 <= bf_p50) - & (bf_p50 <= bf_p50) - & (bf_p75 <= bf_max) - ).all() - - -@pytest.mark.parametrize("include", [None, "all"]) -def test_df_describe_non_numeric(scalars_dfs, include): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - - # Excluding "date_col" here because in BigFrames it is used as PyArrow[date32()], which is - # considered numerical in Pandas - target_columns = ["string_col", "bytes_col", "bool_col", "time_col"] - - modified_bf = scalars_df[target_columns] - bf_result = modified_bf.describe(include=include).to_pandas() - - modified_pd_df = scalars_pandas_df[target_columns] - pd_result = modified_pd_df.describe(include=include) - - # Reindex results with the specified keys and their order, because - # the relative order is not important. - bf_result = bf_result.reindex(["count", "nunique"]) - pd_result = pd_result.reindex( - ["count", "unique"] - # BF counter part of "unique" is called "nunique" - ).rename(index={"unique": "nunique"}) - - pd.testing.assert_frame_equal( - pd_result.astype("Int64"), - bf_result, - check_index_type=False, - ) - - -def test_df_describe_temporal(scalars_dfs): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - - temporal_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"] - - modified_bf = scalars_df[temporal_columns] - bf_result = modified_bf.describe(include="all").to_pandas() - - modified_pd_df = scalars_pandas_df[temporal_columns] - pd_result = modified_pd_df.describe(include="all") - - # Reindex results with the specified keys and their order, because - # the relative order is not important. 
- bf_result = bf_result.reindex(["count", "nunique"]) - pd_result = pd_result.reindex( - ["count", "unique"] - # BF counter part of "unique" is called "nunique" - ).rename(index={"unique": "nunique"}) - - pd.testing.assert_frame_equal( - pd_result.astype("Float64"), - bf_result.astype("Float64"), - check_index_type=False, - ) - - -def test_df_describe_mixed_types_include_all(scalars_dfs): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - - numeric_columns = [ - "int64_col", - "float64_col", - ] - non_numeric_columns = ["string_col"] - supported_columns = numeric_columns + non_numeric_columns - - modified_bf = scalars_df[supported_columns] - bf_result = modified_bf.describe(include="all").to_pandas() - - modified_pd_df = scalars_pandas_df[supported_columns] - pd_result = modified_pd_df.describe(include="all") - - # Drop quartiles, as they are approximate - bf_min = bf_result.loc["min", :] - bf_p25 = bf_result.loc["25%", :] - bf_p50 = bf_result.loc["50%", :] - bf_p75 = bf_result.loc["75%", :] - bf_max = bf_result.loc["max", :] - - # Reindex results with the specified keys and their order, because - # the relative order is not important. - bf_result = bf_result.reindex(["count", "nunique", "mean", "std", "min", "max"]) - pd_result = pd_result.reindex( - ["count", "unique", "mean", "std", "min", "max"] - # BF counter part of "unique" is called "nunique" - ).rename(index={"unique": "nunique"}) - - pd.testing.assert_frame_equal( - pd_result[numeric_columns].astype("Float64"), - bf_result[numeric_columns], - check_index_type=False, - ) - - pd.testing.assert_frame_equal( - pd_result[non_numeric_columns].astype("Int64"), - bf_result[non_numeric_columns], - check_index_type=False, - ) - - # Double-check that quantiles are at least plausible. 
- assert ( - (bf_min <= bf_p25) - & (bf_p25 <= bf_p50) - & (bf_p50 <= bf_p50) - & (bf_p75 <= bf_max) - ).all() - - def test_df_transpose(): # Include some floats to ensure type coercion values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]] diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py index 55e5036a42..1d360e0d4f 100644 --- a/tests/system/small/test_pandas_options.py +++ b/tests/system/small/test_pandas_options.py @@ -18,11 +18,10 @@ import warnings import google.api_core.exceptions -import google.auth -import google.auth.exceptions +import pandas.testing import pytest -import bigframes.core.global_session +import bigframes.exceptions import bigframes.pandas as bpd @@ -327,3 +326,45 @@ def test_credentials_need_reauthentication( # Now verify that use is able to start over df = bpd.read_gbq(test_query) assert df is not None + + +def test_max_rows_normal_execution_within_limit( + scalars_df_index, scalars_pandas_df_index +): + """Test queries execute normally when the number of rows is within the limit.""" + with bpd.option_context("compute.maximum_result_rows", 10): + df = scalars_df_index.head(10) + result = df.to_pandas() + + expected = scalars_pandas_df_index.head(10) + pandas.testing.assert_frame_equal(result, expected) + + with bpd.option_context("compute.maximum_result_rows", 10), bpd.option_context( + "display.repr_mode", "head" + ): + df = scalars_df_index.head(10) + assert repr(df) is not None + + # We should be able to get away with only a single row for shape. + with bpd.option_context("compute.maximum_result_rows", 1): + shape = scalars_df_index.shape + assert shape == scalars_pandas_df_index.shape + + # 0 is not recommended, as it would stop aggregations and many other + # necessary operations, but we shouldn't need even 1 row for to_gbq(). 
+ with bpd.option_context("compute.maximum_result_rows", 0): + destination = scalars_df_index.to_gbq() + assert destination is not None + + +def test_max_rows_exceeds_limit(scalars_df_index): + """Test to_pandas() raises MaximumResultRowsExceeded when the limit is exceeded.""" + with bpd.option_context("compute.maximum_result_rows", 5), pytest.raises( + bigframes.exceptions.MaximumResultRowsExceeded, match="5" + ): + scalars_df_index.to_pandas() + + with bpd.option_context("compute.maximum_result_rows", 5), pytest.raises( + bigframes.exceptions.MaximumResultRowsExceeded, match="5" + ): + next(iter(scalars_df_index.to_pandas_batches())) diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 35b540966e..1e35a2f80f 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -17,6 +17,7 @@ import numpy as np import pandas as pd +import pytest import bigframes as bf import bigframes.formatting_helpers as formatting_helpers @@ -164,3 +165,18 @@ def test_query_job_dry_run_series(penguins_df_default_index: bf.dataframe.DataFr with bf.option_context("display.repr_mode", "deferred"): series_result = repr(penguins_df_default_index["body_mass_g"]) assert EXPECTED_DRY_RUN_MESSAGE in series_result + + +def test_repr_anywidget_dataframe(penguins_df_default_index: bf.dataframe.DataFrame): + pytest.importorskip("anywidget") + with bf.option_context("display.repr_mode", "anywidget"): + actual_repr = repr(penguins_df_default_index) + assert EXPECTED_DRY_RUN_MESSAGE in actual_repr + + +def test_repr_anywidget_idex(penguins_df_default_index: bf.dataframe.DataFrame): + pytest.importorskip("anywidget") + with bf.option_context("display.repr_mode", "anywidget"): + index = penguins_df_default_index.index + actual_repr = repr(index) + assert EXPECTED_DRY_RUN_MESSAGE in actual_repr diff --git a/tests/unit/core/compile/sqlglot/conftest.py b/tests/unit/core/compile/sqlglot/conftest.py index
d9a13ae53f..20f8159f01 100644 --- a/tests/unit/core/compile/sqlglot/conftest.py +++ b/tests/unit/core/compile/sqlglot/conftest.py @@ -21,6 +21,7 @@ import pytest from bigframes import dtypes +import bigframes.pandas as bpd import bigframes.testing.mocks as mocks import bigframes.testing.utils @@ -29,26 +30,52 @@ @pytest.fixture(scope="session") -def compiler_session(basic_types_table_schema): +def compiler_session(scalars_types_table_schema): from bigframes.testing import compiler_session # TODO: Check if ordering mode is needed for the tests. - session = mocks.create_bigquery_session(table_schema=basic_types_table_schema) + table_name = "scalar_types" + anonymous_dataset = bigquery.DatasetReference.from_string( + "bigframes-dev.sqlglot_test" + ) + session = mocks.create_bigquery_session( + table_name=table_name, + table_schema=scalars_types_table_schema, + anonymous_dataset=anonymous_dataset, + ) session._executor = compiler_session.SQLCompilerExecutor() return session @pytest.fixture(scope="session") -def basic_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: +def scalars_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: return [ - bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("bytes_col", "BYTES"), + bigquery.SchemaField("date_col", "DATE"), + bigquery.SchemaField("datetime_col", "DATETIME"), + bigquery.SchemaField("geography_col", "GEOGRAPHY"), bigquery.SchemaField("int64_col", "INTEGER"), - bigquery.SchemaField("string_col", "STRING"), + bigquery.SchemaField("int64_too", "INTEGER"), + bigquery.SchemaField("numeric_col", "NUMERIC"), bigquery.SchemaField("float64_col", "FLOAT"), - bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("rowindex_2", "INTEGER"), + bigquery.SchemaField("string_col", "STRING"), + bigquery.SchemaField("time_col", "TIME"), + bigquery.SchemaField("timestamp_col", "TIMESTAMP"), ] 
+@pytest.fixture(scope="session") +def scalars_types_df(compiler_session) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex` + column as the index.""" + bf_df = compiler_session.read_gbq_table("bigframes-dev.sqlglot_test.scalar_types") + bf_df = bf_df.set_index("rowindex", drop=False) + return bf_df + + @pytest.fixture(scope="session") def scalars_types_pandas_df() -> pd.DataFrame: """Returns a pandas DataFrame containing all scalar types and using the `rowindex` diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql index 855e5874c2..8da545b8fa 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql @@ -4,79 +4,47 @@ WITH `bfcte_1` AS ( FROM UNNEST(ARRAY>[STRUCT(0, 123456789, 0, 'Hello, World!', 0), STRUCT(1, -987654321, 1, 'こんにちは', 1), STRUCT(2, 314159, 2, ' ¡Hola Mundo! 
', 2), STRUCT(3, CAST(NULL AS INT64), 3, CAST(NULL AS STRING), 3), STRUCT(4, -234892, 4, 'Hello, World!', 4), STRUCT(5, 55555, 5, 'Güten Tag!', 5), STRUCT(6, 101202303, 6, 'capitalize, This ', 6), STRUCT(7, -214748367, 7, ' سلام', 7), STRUCT(8, 2, 8, 'T', 8)]) ), `bfcte_3` AS ( SELECT - `bfcol_0` AS `bfcol_5`, - `bfcol_2` AS `bfcol_6`, - `bfcol_1` AS `bfcol_7`, - `bfcol_3` AS `bfcol_8`, - `bfcol_4` AS `bfcol_9` + *, + `bfcol_4` AS `bfcol_10` FROM `bfcte_1` ), `bfcte_5` AS ( - SELECT - *, - `bfcol_9` AS `bfcol_10` - FROM `bfcte_3` -), `bfcte_7` AS ( - SELECT - `bfcol_5` AS `bfcol_11`, - `bfcol_6` AS `bfcol_12`, - `bfcol_7` AS `bfcol_13`, - `bfcol_8` AS `bfcol_14`, - `bfcol_10` AS `bfcol_15` - FROM `bfcte_5` -), `bfcte_9` AS ( SELECT *, 0 AS `bfcol_16` - FROM `bfcte_7` -), `bfcte_10` AS ( + FROM `bfcte_3` +), `bfcte_6` AS ( SELECT - `bfcol_11` AS `bfcol_17`, - `bfcol_12` AS `bfcol_18`, - `bfcol_13` AS `bfcol_19`, - `bfcol_14` AS `bfcol_20`, + `bfcol_0` AS `bfcol_17`, + `bfcol_2` AS `bfcol_18`, + `bfcol_1` AS `bfcol_19`, + `bfcol_3` AS `bfcol_20`, `bfcol_16` AS `bfcol_21`, - `bfcol_15` AS `bfcol_22` - FROM `bfcte_9` + `bfcol_10` AS `bfcol_22` + FROM `bfcte_5` ), `bfcte_0` AS ( SELECT * FROM UNNEST(ARRAY>[STRUCT(0, 123456789, 0, 'Hello, World!', 0), STRUCT(1, -987654321, 1, 'こんにちは', 1), STRUCT(2, 314159, 2, ' ¡Hola Mundo! 
', 2), STRUCT(3, CAST(NULL AS INT64), 3, CAST(NULL AS STRING), 3), STRUCT(4, -234892, 4, 'Hello, World!', 4), STRUCT(5, 55555, 5, 'Güten Tag!', 5), STRUCT(6, 101202303, 6, 'capitalize, This ', 6), STRUCT(7, -214748367, 7, ' سلام', 7), STRUCT(8, 2, 8, 'T', 8)]) ), `bfcte_2` AS ( SELECT - `bfcol_23` AS `bfcol_28`, - `bfcol_25` AS `bfcol_29`, - `bfcol_24` AS `bfcol_30`, - `bfcol_26` AS `bfcol_31`, - `bfcol_27` AS `bfcol_32` + *, + `bfcol_27` AS `bfcol_33` FROM `bfcte_0` ), `bfcte_4` AS ( SELECT *, - `bfcol_32` AS `bfcol_33` + 1 AS `bfcol_39` FROM `bfcte_2` -), `bfcte_6` AS ( +), `bfcte_7` AS ( SELECT - `bfcol_28` AS `bfcol_34`, - `bfcol_29` AS `bfcol_35`, - `bfcol_30` AS `bfcol_36`, - `bfcol_31` AS `bfcol_37`, - `bfcol_33` AS `bfcol_38` + `bfcol_23` AS `bfcol_40`, + `bfcol_25` AS `bfcol_41`, + `bfcol_24` AS `bfcol_42`, + `bfcol_26` AS `bfcol_43`, + `bfcol_39` AS `bfcol_44`, + `bfcol_33` AS `bfcol_45` FROM `bfcte_4` ), `bfcte_8` AS ( - SELECT - *, - 1 AS `bfcol_39` - FROM `bfcte_6` -), `bfcte_11` AS ( - SELECT - `bfcol_34` AS `bfcol_40`, - `bfcol_35` AS `bfcol_41`, - `bfcol_36` AS `bfcol_42`, - `bfcol_37` AS `bfcol_43`, - `bfcol_39` AS `bfcol_44`, - `bfcol_38` AS `bfcol_45` - FROM `bfcte_8` -), `bfcte_12` AS ( SELECT * FROM ( @@ -87,7 +55,7 @@ WITH `bfcte_1` AS ( bfcol_20 AS `bfcol_49`, bfcol_21 AS `bfcol_50`, bfcol_22 AS `bfcol_51` - FROM `bfcte_10` + FROM `bfcte_6` UNION ALL SELECT bfcol_40 AS `bfcol_46`, @@ -96,7 +64,7 @@ WITH `bfcte_1` AS ( bfcol_43 AS `bfcol_49`, bfcol_44 AS `bfcol_50`, bfcol_45 AS `bfcol_51` - FROM `bfcte_11` + FROM `bfcte_7` ) ) SELECT @@ -104,7 +72,7 @@ SELECT `bfcol_47` AS `rowindex_1`, `bfcol_48` AS `int64_col`, `bfcol_49` AS `string_col` -FROM `bfcte_12` +FROM `bfcte_8` ORDER BY `bfcol_50` ASC NULLS LAST, `bfcol_51` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql 
b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql index 2804925b2d..3f819800e5 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql @@ -1,33 +1,26 @@ WITH `bfcte_0` AS ( SELECT - `rowindex` AS `bfcol_0`, - `int64_col` AS `bfcol_1`, - `string_col` AS `bfcol_2`, - `float64_col` AS `bfcol_3`, - `bool_col` AS `bfcol_4` - FROM `test-project`.`test_dataset`.`test_table` + `int64_col` AS `bfcol_0`, + `rowindex` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT - *, - `bfcol_0` AS `bfcol_5`, - `bfcol_2` AS `bfcol_6`, - `bfcol_3` AS `bfcol_7`, - `bfcol_4` AS `bfcol_8`, - `bfcol_1` + 1 AS `bfcol_9` + `bfcol_1` AS `bfcol_2`, + `bfcol_0` AS `bfcol_3` FROM `bfcte_0` ), `bfcte_2` AS ( SELECT - `bfcol_5` AS `bfcol_10`, - `bfcol_9` AS `bfcol_11`, - `bfcol_6` AS `bfcol_12`, - `bfcol_7` AS `bfcol_13`, - `bfcol_8` AS `bfcol_14` + *, + `bfcol_2` AS `bfcol_4`, + `bfcol_3` + 1 AS `bfcol_5` FROM `bfcte_1` +), `bfcte_3` AS ( + SELECT + `bfcol_4` AS `bfcol_6`, + `bfcol_5` AS `bfcol_7` + FROM `bfcte_2` ) SELECT - `bfcol_10` AS `rowindex`, - `bfcol_11` AS `int64_col`, - `bfcol_12` AS `string_col`, - `bfcol_13` AS `float64_col`, - `bfcol_14` AS `bool_col` -FROM `bfcte_2` \ No newline at end of file + `bfcol_6` AS `rowindex`, + `bfcol_7` AS `int64_col` +FROM `bfcte_3` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql index 89c51b346d..70d73db6a7 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql @@ -155,42 
+155,23 @@ WITH `bfcte_0` AS ( CAST(NULL AS TIMESTAMP), 8 )]) -), `bfcte_1` AS ( - SELECT - `bfcol_0` AS `bfcol_16`, - `bfcol_1` AS `bfcol_17`, - `bfcol_2` AS `bfcol_18`, - `bfcol_3` AS `bfcol_19`, - `bfcol_4` AS `bfcol_20`, - `bfcol_5` AS `bfcol_21`, - `bfcol_6` AS `bfcol_22`, - `bfcol_7` AS `bfcol_23`, - `bfcol_8` AS `bfcol_24`, - `bfcol_9` AS `bfcol_25`, - `bfcol_10` AS `bfcol_26`, - `bfcol_11` AS `bfcol_27`, - `bfcol_12` AS `bfcol_28`, - `bfcol_13` AS `bfcol_29`, - `bfcol_14` AS `bfcol_30`, - `bfcol_15` AS `bfcol_31` - FROM `bfcte_0` ) SELECT - `bfcol_16` AS `rowindex`, - `bfcol_17` AS `bool_col`, - `bfcol_18` AS `bytes_col`, - `bfcol_19` AS `date_col`, - `bfcol_20` AS `datetime_col`, - `bfcol_21` AS `geography_col`, - `bfcol_22` AS `int64_col`, - `bfcol_23` AS `int64_too`, - `bfcol_24` AS `numeric_col`, - `bfcol_25` AS `float64_col`, - `bfcol_26` AS `rowindex_1`, - `bfcol_27` AS `rowindex_2`, - `bfcol_28` AS `string_col`, - `bfcol_29` AS `time_col`, - `bfcol_30` AS `timestamp_col` -FROM `bfcte_1` + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `bool_col`, + `bfcol_2` AS `bytes_col`, + `bfcol_3` AS `date_col`, + `bfcol_4` AS `datetime_col`, + `bfcol_5` AS `geography_col`, + `bfcol_6` AS `int64_col`, + `bfcol_7` AS `int64_too`, + `bfcol_8` AS `numeric_col`, + `bfcol_9` AS `float64_col`, + `bfcol_10` AS `rowindex_1`, + `bfcol_11` AS `rowindex_2`, + `bfcol_12` AS `string_col`, + `bfcol_13` AS `time_col`, + `bfcol_14` AS `timestamp_col` +FROM `bfcte_0` ORDER BY - `bfcol_31` ASC NULLS LAST \ No newline at end of file + `bfcol_15` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql index 76cbde7c64..100036d75f 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql +++ 
b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql @@ -2,14 +2,9 @@ WITH `bfcte_0` AS ( SELECT * FROM UNNEST(ARRAY>[STRUCT(PARSE_JSON('null'), 0), STRUCT(PARSE_JSON('true'), 1), STRUCT(PARSE_JSON('100'), 2), STRUCT(PARSE_JSON('0.98'), 3), STRUCT(PARSE_JSON('"a string"'), 4), STRUCT(PARSE_JSON('[]'), 5), STRUCT(PARSE_JSON('[1,2,3]'), 6), STRUCT(PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(PARSE_JSON('"100"'), 8), STRUCT(PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(PARSE_JSON('{"list_data":[10,20,30]}'), 11)]) -), `bfcte_1` AS ( - SELECT - `bfcol_0` AS `bfcol_2`, - `bfcol_1` AS `bfcol_3` - FROM `bfcte_0` ) SELECT - `bfcol_2` AS `json_col` -FROM `bfcte_1` + `bfcol_0` AS `json_col` +FROM `bfcte_0` ORDER BY - `bfcol_3` ASC NULLS LAST \ No newline at end of file + `bfcol_1` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql index 6363739d9d..923476aafd 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql @@ -32,28 +32,16 @@ WITH `bfcte_0` AS ( ['', 'a'], 2 )]) -), `bfcte_1` AS ( - SELECT - `bfcol_0` AS `bfcol_9`, - `bfcol_1` AS `bfcol_10`, - `bfcol_2` AS `bfcol_11`, - `bfcol_3` AS `bfcol_12`, - `bfcol_4` AS `bfcol_13`, - `bfcol_5` AS `bfcol_14`, - `bfcol_6` AS `bfcol_15`, - `bfcol_7` AS `bfcol_16`, - `bfcol_8` AS `bfcol_17` - FROM `bfcte_0` ) SELECT - `bfcol_9` AS `rowindex`, - `bfcol_10` AS `int_list_col`, - `bfcol_11` AS `bool_list_col`, - `bfcol_12` AS `float_list_col`, - `bfcol_13` AS `date_list_col`, - `bfcol_14` AS `date_time_list_col`, - 
`bfcol_15` AS `numeric_list_col`, - `bfcol_16` AS `string_list_col` -FROM `bfcte_1` + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `int_list_col`, + `bfcol_2` AS `bool_list_col`, + `bfcol_3` AS `float_list_col`, + `bfcol_4` AS `date_list_col`, + `bfcol_5` AS `date_time_list_col`, + `bfcol_6` AS `numeric_list_col`, + `bfcol_7` AS `string_list_col` +FROM `bfcte_0` ORDER BY - `bfcol_17` ASC NULLS LAST \ No newline at end of file + `bfcol_8` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql index af7206b759..7ded9cf5ff 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql @@ -18,16 +18,10 @@ WITH `bfcte_0` AS ( ), 1 )]) -), `bfcte_1` AS ( - SELECT - `bfcol_0` AS `bfcol_3`, - `bfcol_1` AS `bfcol_4`, - `bfcol_2` AS `bfcol_5` - FROM `bfcte_0` ) SELECT - `bfcol_3` AS `id`, - `bfcol_4` AS `person` -FROM `bfcte_1` + `bfcol_0` AS `id`, + `bfcol_1` AS `person` +FROM `bfcte_0` ORDER BY - `bfcol_5` ASC NULLS LAST \ No newline at end of file + `bfcol_2` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql index a5cb399b40..34fc8e3c49 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable/out.sql @@ -1,16 +1,35 @@ WITH `bfcte_0` AS ( SELECT - `rowindex` AS `bfcol_0`, - `int64_col` AS `bfcol_1`, - `string_col` AS `bfcol_2`, - `float64_col` AS 
`bfcol_3`, - `bool_col` AS `bfcol_4` - FROM `test-project`.`test_dataset`.`test_table` + `bool_col` AS `bfcol_0`, + `bytes_col` AS `bfcol_1`, + `date_col` AS `bfcol_2`, + `datetime_col` AS `bfcol_3`, + `geography_col` AS `bfcol_4`, + `int64_col` AS `bfcol_5`, + `int64_too` AS `bfcol_6`, + `numeric_col` AS `bfcol_7`, + `float64_col` AS `bfcol_8`, + `rowindex` AS `bfcol_9`, + `rowindex_2` AS `bfcol_10`, + `string_col` AS `bfcol_11`, + `time_col` AS `bfcol_12`, + `timestamp_col` AS `bfcol_13` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ) SELECT - `bfcol_0` AS `rowindex`, - `bfcol_1` AS `int64_col`, - `bfcol_2` AS `string_col`, - `bfcol_3` AS `float64_col`, - `bfcol_4` AS `bool_col` + `bfcol_9` AS `rowindex`, + `bfcol_0` AS `bool_col`, + `bfcol_1` AS `bytes_col`, + `bfcol_2` AS `date_col`, + `bfcol_3` AS `datetime_col`, + `bfcol_4` AS `geography_col`, + `bfcol_5` AS `int64_col`, + `bfcol_6` AS `int64_too`, + `bfcol_7` AS `numeric_col`, + `bfcol_8` AS `float64_col`, + `bfcol_9` AS `rowindex_1`, + `bfcol_10` AS `rowindex_2`, + `bfcol_11` AS `string_col`, + `bfcol_12` AS `time_col`, + `bfcol_13` AS `timestamp_col` FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql index c5724c8442..f97eb7bf06 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_limit/out.sql @@ -1,18 +1,12 @@ WITH `bfcte_0` AS ( SELECT - `rowindex` AS `bfcol_0`, - `int64_col` AS `bfcol_1`, - `string_col` AS `bfcol_2`, - `float64_col` AS `bfcol_3`, - `bool_col` AS `bfcol_4` - FROM `test-project`.`test_dataset`.`test_table` + `int64_col` AS `bfcol_0`, + `rowindex` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ) 
SELECT - `bfcol_0` AS `rowindex`, - `bfcol_1` AS `int64_col`, - `bfcol_2` AS `string_col`, - `bfcol_3` AS `float64_col`, - `bfcol_4` AS `bool_col` + `bfcol_1` AS `rowindex`, + `bfcol_0` AS `int64_col` FROM `bfcte_0` ORDER BY `bfcol_1` ASC NULLS LAST diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql index 238659cc01..6a16b98baa 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_ordering/out.sql @@ -1,18 +1,12 @@ WITH `bfcte_0` AS ( SELECT - `rowindex` AS `bfcol_0`, - `int64_col` AS `bfcol_1`, - `string_col` AS `bfcol_2`, - `float64_col` AS `bfcol_3`, - `bool_col` AS `bfcol_4` - FROM `test-project`.`test_dataset`.`test_table` + `int64_col` AS `bfcol_0`, + `rowindex` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ) SELECT - `bfcol_0` AS `rowindex`, - `bfcol_1` AS `int64_col`, - `bfcol_2` AS `string_col`, - `bfcol_3` AS `float64_col`, - `bfcol_4` AS `bool_col` + `bfcol_1` AS `rowindex`, + `bfcol_0` AS `int64_col` FROM `bfcte_0` ORDER BY `bfcol_0` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add/out.sql index 405b02d897..1496f89f28 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add/out.sql @@ -1,33 +1,16 @@ WITH `bfcte_0` AS ( SELECT - `rowindex` AS `bfcol_0`, - `int64_col` AS `bfcol_1`, - `string_col` AS `bfcol_2`, - `float64_col` AS `bfcol_3`, - 
`bool_col` AS `bfcol_4` - FROM `test-project`.`test_dataset`.`test_table` + `int64_col` AS `bfcol_0`, + `rowindex` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT *, - `bfcol_0` AS `bfcol_5`, - `bfcol_2` AS `bfcol_6`, - `bfcol_3` AS `bfcol_7`, - `bfcol_4` AS `bfcol_8`, - `bfcol_1` + `bfcol_1` AS `bfcol_9` + `bfcol_1` AS `bfcol_4`, + `bfcol_0` + `bfcol_0` AS `bfcol_5` FROM `bfcte_0` -), `bfcte_2` AS ( - SELECT - `bfcol_5` AS `bfcol_10`, - `bfcol_9` AS `bfcol_11`, - `bfcol_6` AS `bfcol_12`, - `bfcol_7` AS `bfcol_13`, - `bfcol_8` AS `bfcol_14` - FROM `bfcte_1` ) SELECT - `bfcol_10` AS `rowindex`, - `bfcol_11` AS `int64_col`, - `bfcol_12` AS `string_col`, - `bfcol_13` AS `float64_col`, - `bfcol_14` AS `bool_col` -FROM `bfcte_2` \ No newline at end of file + `bfcol_4` AS `rowindex`, + `bfcol_5` AS `int64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add_w_scalar/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add_w_scalar/out.sql new file mode 100644 index 0000000000..9c4b01a6df --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add_w_scalar/out.sql @@ -0,0 +1,16 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0`, + `rowindex` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_1` AS `bfcol_4`, + `bfcol_0` + 1 AS `bfcol_5` + FROM `bfcte_0` +) +SELECT + `bfcol_4` AS `rowindex`, + `bfcol_5` AS `int64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_string_add/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_string_add/out.sql index 49ec5435f9..7a8ab83df1 100644 --- 
a/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_string_add/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_string_add/out.sql @@ -1,33 +1,16 @@ WITH `bfcte_0` AS ( SELECT `rowindex` AS `bfcol_0`, - `int64_col` AS `bfcol_1`, - `string_col` AS `bfcol_2`, - `float64_col` AS `bfcol_3`, - `bool_col` AS `bfcol_4` - FROM `test-project`.`test_dataset`.`test_table` + `string_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT *, - `bfcol_0` AS `bfcol_5`, - `bfcol_1` AS `bfcol_6`, - `bfcol_3` AS `bfcol_7`, - `bfcol_4` AS `bfcol_8`, - CONCAT(`bfcol_2`, 'a') AS `bfcol_9` + `bfcol_0` AS `bfcol_4`, + CONCAT(`bfcol_1`, 'a') AS `bfcol_5` FROM `bfcte_0` -), `bfcte_2` AS ( - SELECT - `bfcol_5` AS `bfcol_10`, - `bfcol_6` AS `bfcol_11`, - `bfcol_9` AS `bfcol_12`, - `bfcol_7` AS `bfcol_13`, - `bfcol_8` AS `bfcol_14` - FROM `bfcte_1` ) SELECT - `bfcol_10` AS `rowindex`, - `bfcol_11` AS `int64_col`, - `bfcol_12` AS `string_col`, - `bfcol_13` AS `float64_col`, - `bfcol_14` AS `bool_col` -FROM `bfcte_2` \ No newline at end of file + `bfcol_4` AS `rowindex`, + `bfcol_5` AS `string_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_readtable.py b/tests/unit/core/compile/sqlglot/test_compile_readtable.py index 41e01e9b25..63849f093c 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_readtable.py +++ b/tests/unit/core/compile/sqlglot/test_compile_readtable.py @@ -14,23 +14,22 @@ import pytest -import bigframes +import bigframes.pandas as bpd pytest.importorskip("pytest_snapshot") -def test_compile_readtable(compiler_session: bigframes.Session, snapshot): - bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") - snapshot.assert_match(bf_df.sql, "out.sql") +def test_compile_readtable(scalars_types_df: bpd.DataFrame, snapshot): + snapshot.assert_match(scalars_types_df.sql, "out.sql") 
-def test_compile_readtable_w_ordering(compiler_session: bigframes.Session, snapshot): - bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") - bf_df = bf_df.set_index("rowindex").sort_index() +def test_compile_readtable_w_ordering(scalars_types_df: bpd.DataFrame, snapshot): + bf_df = scalars_types_df[["int64_col"]] + bf_df = bf_df.sort_values("int64_col") snapshot.assert_match(bf_df.sql, "out.sql") -def test_compile_readtable_w_limit(compiler_session: bigframes.Session, snapshot): - bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") - bf_df = bf_df.sort_values("int64_col").head(10) +def test_compile_readtable_w_limit(scalars_types_df: bpd.DataFrame, snapshot): + bf_df = scalars_types_df[["int64_col"]] + bf_df = bf_df.sort_index().head(10) snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py b/tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py index ebdb82477f..862ee2467c 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py +++ b/tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py @@ -14,18 +14,24 @@ import pytest -import bigframes +import bigframes.pandas as bpd pytest.importorskip("pytest_snapshot") -def test_compile_numerical_add(compiler_session: bigframes.Session, snapshot): - bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") +def test_compile_numerical_add(scalars_types_df: bpd.DataFrame, snapshot): + bf_df = scalars_types_df[["int64_col"]] bf_df["int64_col"] = bf_df["int64_col"] + bf_df["int64_col"] snapshot.assert_match(bf_df.sql, "out.sql") -def test_compile_string_add(compiler_session: bigframes.Session, snapshot): - bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table") +def test_compile_numerical_add_w_scalar(scalars_types_df: bpd.DataFrame, snapshot): + bf_df = scalars_types_df[["int64_col"]] + bf_df["int64_col"] = bf_df["int64_col"] + 1 + 
snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_compile_string_add(scalars_types_df: bpd.DataFrame, snapshot): + bf_df = scalars_types_df[["string_col"]] bf_df["string_col"] = bf_df["string_col"] + "a" snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/core/test_bf_utils.py b/tests/unit/core/test_bf_utils.py index 9b4c4f8742..6fb796329f 100644 --- a/tests/unit/core/test_bf_utils.py +++ b/tests/unit/core/test_bf_utils.py @@ -59,7 +59,7 @@ def test_get_standardized_ids_tuple(): col_ids, _ = utils.get_standardized_ids(col_labels) - assert col_ids == ["('foo', 1)", "('foo', 2)", "('bar', 1)"] + assert col_ids == ["_'foo'_ 1_", "_'foo'_ 2_", "_'bar'_ 1_"] @pytest.mark.parametrize( diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 5a7220fc38..d605b571f3 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -155,6 +155,33 @@ def test_polynomial_expand( assert sql == "ML.POLYNOMIAL_EXPAND(STRUCT(`col_a`, `col_b`), 2) AS `poly_exp`" +def test_ai_forecast_correct( + base_sql_generator: ml_sql.BaseSqlGenerator, + mock_df: bpd.DataFrame, +): + sql = base_sql_generator.ai_forecast( + source_sql=mock_df.sql, + options={ + "model": "TimesFM 2.0", + "data_col": "data1", + "timestamp_col": "time1", + "id_cols": ("id1", "id2"), + "horizon": 10, + "confidence_level": 0.95, + }, + ) + assert ( + sql + == """SELECT * FROM AI.FORECAST((input_X_y_sql), + model => 'TimesFM 2.0', + data_col => 'data1', + timestamp_col => 'time1', + id_cols => ['id1', 'id2'], + horizon => 10, + confidence_level => 0.95)""" + ) + + def test_create_model_correct( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, mock_df: bpd.DataFrame, diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 6c927a5c26..224fe25f16 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -6222,7 +6222,7 @@ def 
agg(self, func): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def describe(self): + def describe(self, include: None | Literal["all"] = None): """ Generate descriptive statistics. @@ -6230,7 +6230,10 @@ def describe(self): tendency, dispersion and shape of a dataset's distribution, excluding ``NaN`` values. - Only supports numeric columns. + Args: + include ("all" or None, optional): + If "all": All columns of the input will be included in the output. + If None: The result will include all numeric columns. .. note:: Percentile values are approximates only. @@ -6247,28 +6250,44 @@ def describe(self): >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [0, 2, 8]}) + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [0, 2, 8], "C": ["cat", "cat", "dog"]}) >>> df - A B - 0 3 0 - 1 1 2 - 2 2 8 + A B C + 0 3 0 cat + 1 1 2 cat + 2 2 8 dog - [3 rows x 2 columns] + [3 rows x 3 columns] >>> df.describe() - A B - count 3.0 3.0 - mean 2.0 3.333333 - std 1.0 4.163332 - min 1.0 0.0 - 25% 1.0 0.0 - 50% 2.0 2.0 - 75% 3.0 8.0 - max 3.0 8.0 + A B + count 3.0 3.0 + mean 2.0 3.333333 + std 1.0 4.163332 + min 1.0 0.0 + 25% 1.0 0.0 + 50% 2.0 2.0 + 75% 3.0 8.0 + max 3.0 8.0 [8 rows x 2 columns] + + Using describe with include = "all": + >>> df.describe(include="all") + A B C + count 3.0 3.0 3 + nunique 2 + mean 2.0 3.333333 + std 1.0 4.163332 + min 1.0 0.0 + 25% 1.0 0.0 + 50% 2.0 2.0 + 75% 3.0 8.0 + max 3.0 8.0 + + [9 rows x 3 columns] + Returns: bigframes.pandas.DataFrame: Summary statistics of the Series or Dataframe provided. 
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index b2846d675c..0160a7eb50 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -4850,6 +4850,47 @@ def prod(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def describe(self): + """ + Generate descriptive statistics. + + Descriptive statistics include those that summarize the central + tendency, dispersion and shape of a + dataset's distribution, excluding ``NaN`` values. + + .. note:: + Percentile values are approximates only. + + .. note:: + For numeric data, the result's index will include ``count``, + ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and + upper percentiles. By default the lower percentile is ``25`` and the + upper percentile is ``75``. The ``50`` percentile is the + same as the median. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['A', 'A', 'B']) + >>> s + 0 A + 1 A + 2 B + dtype: string + + >>> s.describe() + count 3 + nunique 2 + Name: 0, dtype: Int64 + + Returns: + bigframes.pandas.Series: + Summary statistics of the Series. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def skew(self): """Return unbiased skew over requested axis. diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 138c380d0c..5d2de2f97f 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.7.0" +__version__ = "2.8.0" # {x-release-please-start-date} -__release_date__ = "2025-06-16" +__release_date__ = "2025-06-23" # {x-release-please-end}