diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7e46c73d0d..93cc5e4210 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,6 +39,6 @@ repos: rev: v1.15.0 hooks: - id: mypy - additional_dependencies: [types-requests, types-tabulate, pandas-stubs<=2.2.3.241126] + additional_dependencies: [types-requests, types-tabulate, types-PyYAML, pandas-stubs<=2.2.3.241126] exclude: "^third_party" args: ["--check-untyped-defs", "--explicit-package-bases", "--ignore-missing-imports"] diff --git a/CHANGELOG.md b/CHANGELOG.md index f649f2f8a4..313064241d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,34 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.9.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.8.0...v2.9.0) (2025-06-30) + + +### Features + +* Add `bpd.read_arrow` to convert an Arrow object into a bigframes DataFrame ([#1855](https://github.com/googleapis/python-bigquery-dataframes/issues/1855)) ([633bf98](https://github.com/googleapis/python-bigquery-dataframes/commit/633bf98fde33264be4fc9d7454e541c560589152)) +* Add experimental polars execution ([#1747](https://github.com/googleapis/python-bigquery-dataframes/issues/1747)) ([daf0c3b](https://github.com/googleapis/python-bigquery-dataframes/commit/daf0c3b349fb1e85e7070c54a2d3f5460f5e40c9)) +* Add size op support in local engine ([#1865](https://github.com/googleapis/python-bigquery-dataframes/issues/1865)) ([942e66c](https://github.com/googleapis/python-bigquery-dataframes/commit/942e66c483c9afbb680a7af56c9e9a76172a33e1)) +* Create `deploy_remote_function` and `deploy_udf` functions to immediately deploy functions to BigQuery ([#1832](https://github.com/googleapis/python-bigquery-dataframes/issues/1832)) ([c706759](https://github.com/googleapis/python-bigquery-dataframes/commit/c706759b85359b6d23ce3449f6ab138ad2d22f9d)) +* Support index item assign in Series ([#1868](https://github.com/googleapis/python-bigquery-dataframes/issues/1868)) 
([c5d251a](https://github.com/googleapis/python-bigquery-dataframes/commit/c5d251a1d454bb4ef55ea9905faeadd646a23b14)) +* Support item assignment in series ([#1859](https://github.com/googleapis/python-bigquery-dataframes/issues/1859)) ([25684ff](https://github.com/googleapis/python-bigquery-dataframes/commit/25684ff60367f49dd318d4677a7438abdc98bff9)) +* Support local execution of comparison ops ([#1849](https://github.com/googleapis/python-bigquery-dataframes/issues/1849)) ([1c45ccb](https://github.com/googleapis/python-bigquery-dataframes/commit/1c45ccb133091aa85bc34450704fc8cab3d9296b)) + + +### Bug Fixes + +* Fix bug selecting column repeatedly ([#1858](https://github.com/googleapis/python-bigquery-dataframes/issues/1858)) ([cc339e9](https://github.com/googleapis/python-bigquery-dataframes/commit/cc339e9938129cac896460e3a794b3ec8479fa4a)) +* Fix bug with DataFrame.agg for string values ([#1870](https://github.com/googleapis/python-bigquery-dataframes/issues/1870)) ([81e4d64](https://github.com/googleapis/python-bigquery-dataframes/commit/81e4d64c5a3bd8d30edaf909d0bef2d1d1a51c01)) +* Generate GoogleSQL instead of legacy SQL data types for `dry_run=True` from `bpd._read_gbq_colab` with local pandas DataFrame ([#1867](https://github.com/googleapis/python-bigquery-dataframes/issues/1867)) ([fab3c38](https://github.com/googleapis/python-bigquery-dataframes/commit/fab3c387b2ad66043244fa813a366e613b41c60f)) +* Revert dict back to protobuf in the iam binding update ([#1838](https://github.com/googleapis/python-bigquery-dataframes/issues/1838)) ([9fb3cb4](https://github.com/googleapis/python-bigquery-dataframes/commit/9fb3cb444607df6736d383a2807059bca470c453)) + + +### Documentation + +* Add data visualization samples for public doc ([#1847](https://github.com/googleapis/python-bigquery-dataframes/issues/1847)) ([15e1277](https://github.com/googleapis/python-bigquery-dataframes/commit/15e1277b1413de18a5e36f72959a99701d6df08b)) +* Changed broken logo 
([#1866](https://github.com/googleapis/python-bigquery-dataframes/issues/1866)) ([e3c06b4](https://github.com/googleapis/python-bigquery-dataframes/commit/e3c06b4a07d0669a42460d081f1582b681ae3dd5)) +* Update ai.forecast notebook ([#1844](https://github.com/googleapis/python-bigquery-dataframes/issues/1844)) ([1863538](https://github.com/googleapis/python-bigquery-dataframes/commit/186353888db537b561ee994256f998df361b4071)) + ## [2.8.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.7.0...v2.8.0) (2025-06-23) diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index d591ea85b3..09ffee95d4 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -22,6 +22,7 @@ import google.auth.credentials import requests.adapters +import bigframes._importing import bigframes.enums import bigframes.exceptions as bfe @@ -94,6 +95,7 @@ def __init__( requests_transport_adapters: Sequence[ Tuple[str, requests.adapters.BaseAdapter] ] = (), + enable_polars_execution: bool = False, ): self._credentials = credentials self._project = project @@ -113,6 +115,9 @@ def __init__( client_endpoints_override = {} self._client_endpoints_override = client_endpoints_override + if enable_polars_execution: + bigframes._importing.import_polars() + self._enable_polars_execution = enable_polars_execution @property def application_name(self) -> Optional[str]: @@ -424,3 +429,22 @@ def requests_transport_adapters( SESSION_STARTED_MESSAGE.format(attribute="requests_transport_adapters") ) self._requests_transport_adapters = value + + @property + def enable_polars_execution(self) -> bool: + """If True, will use polars to execute some simple query plans locally.""" + return self._enable_polars_execution + + @enable_polars_execution.setter + def enable_polars_execution(self, value: bool): + if self._session_started and self._enable_polars_execution != value: + raise ValueError( + 
SESSION_STARTED_MESSAGE.format(attribute="enable_polars_execution") + ) + if value is True: + msg = bfe.format_message( + "Polars execution is an experimental feature, and may not be stable. Must have polars installed." + ) + warnings.warn(msg, category=bfe.PreviewWarning) + bigframes._importing.import_polars() + self._enable_polars_execution = value diff --git a/bigframes/_importing.py b/bigframes/_importing.py new file mode 100644 index 0000000000..095a1d9c51 --- /dev/null +++ b/bigframes/_importing.py @@ -0,0 +1,30 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import importlib +from types import ModuleType + +from packaging import version + +# Keep this in sync with setup.py +POLARS_MIN_VERSION = version.Version("1.7.0") + + +def import_polars() -> ModuleType: + polars_module = importlib.import_module("polars") + imported_version = version.Version(polars_module.build_info()["version"]) + if imported_version < POLARS_MIN_VERSION: + raise ImportError( + f"Imported polars version: {imported_version} is below the minimum version: {POLARS_MIN_VERSION}" + ) + return polars_module diff --git a/bigframes/clients.py b/bigframes/clients.py index f1f6d686fd..e6ddd5c6cb 100644 --- a/bigframes/clients.py +++ b/bigframes/clients.py @@ -24,6 +24,7 @@ import google.api_core.exceptions import google.api_core.retry from google.cloud import bigquery_connection_v1, resourcemanager_v3 +from google.iam.v1 import policy_pb2 logger = logging.getLogger(__name__) @@ -172,10 +173,7 @@ def _ensure_iam_binding( return # Create a new binding - new_binding = { - "role": role, - "members": [service_account], - } # Use a dictionary to avoid problematic google.iam namespace package. 
+ new_binding = policy_pb2.Binding(role=role, members=[service_account]) policy.bindings.append(new_binding) request = { "resource": project, diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index 4b05781cb7..b47637cb59 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -330,12 +330,27 @@ def create_constant( return self.project_to_id(ex.const(value, dtype)) - def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: + def select_columns( + self, column_ids: typing.Sequence[str], allow_renames: bool = False + ) -> ArrayValue: # This basically just drops and reorders columns - logically a no-op except as a final step - selections = ( - bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id)) - for col_id in column_ids - ) + selections = [] + seen = set() + + for id in column_ids: + if id not in seen: + ref = nodes.AliasedRef.identity(ids.ColumnId(id)) + elif allow_renames: + ref = nodes.AliasedRef( + ex.deref(id), ids.ColumnId(bigframes.core.guid.generate_guid()) + ) + else: + raise ValueError( + "Must set allow_renames=True to select columns repeatedly" + ) + selections.append(ref) + seen.add(id) + return ArrayValue( nodes.SelectionNode( child=self.node, diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 675e8c8b7a..6d476cc795 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -50,6 +50,7 @@ import bigframes.core.identifiers import bigframes.core.join_def as join_defs import bigframes.core.ordering as ordering +import bigframes.core.pyarrow_utils as pyarrow_utils import bigframes.core.schema as bf_schema import bigframes.core.sql as sql import bigframes.core.utils as utils @@ -156,6 +157,36 @@ def __init__( self._view_ref: Optional[bigquery.TableReference] = None self._view_ref_dry_run: Optional[bigquery.TableReference] = None + @classmethod + def from_pyarrow( + cls, + data: pa.Table, + session: bigframes.Session, + ) -> Block: + column_labels = 
data.column_names + + # TODO(tswast): Use array_value.promote_offsets() instead once that node is + # supported by the local engine. + offsets_col = bigframes.core.guid.generate_guid() + index_ids = [offsets_col] + index_labels = [None] + + # TODO(https://github.com/googleapis/python-bigquery-dataframes/issues/859): + # Allow users to specify the "total ordering" column(s) or allow multiple + # such columns. + data = pyarrow_utils.append_offsets(data, offsets_col=offsets_col) + + # from_pyarrow will normalize the types for us. + managed_data = local_data.ManagedArrowTable.from_pyarrow(data) + array_value = core.ArrayValue.from_managed(managed_data, session=session) + block = cls( + array_value, + column_labels=column_labels, + index_columns=index_ids, + index_labels=index_labels, + ) + return block + @classmethod def from_local( cls, @@ -1210,7 +1241,10 @@ def select_column(self, id: str) -> Block: return self.select_columns([id]) def select_columns(self, ids: typing.Sequence[str]) -> Block: - expr = self._expr.select_columns([*self.index_columns, *ids]) + # Allow renames as may end up selecting same columns multiple times + expr = self._expr.select_columns( + [*self.index_columns, *ids], allow_renames=True + ) col_labels = self._get_labels_for_columns(ids) return Block(expr, self.index_columns, col_labels, self.index.names) @@ -1996,7 +2030,7 @@ def _generate_resample_label( return block.set_index([resample_label_id]) def _create_stack_column(self, col_label: typing.Tuple, stack_labels: pd.Index): - dtype = None + input_dtypes = [] input_columns: list[Optional[str]] = [] for uvalue in utils.index_as_tuples(stack_labels): label_to_match = (*col_label, *uvalue) @@ -2006,15 +2040,18 @@ def _create_stack_column(self, col_label: typing.Tuple, stack_labels: pd.Index): matching_ids = self.label_to_col_id.get(label_to_match, []) input_id = matching_ids[0] if len(matching_ids) > 0 else None if input_id: - if dtype and dtype != self._column_type(input_id): - raise 
NotImplementedError( - "Cannot stack columns with non-matching dtypes." - ) - else: - dtype = self._column_type(input_id) + input_dtypes.append(self._column_type(input_id)) input_columns.append(input_id) # Input column i is the first one that - return tuple(input_columns), dtype or pd.Float64Dtype() + if len(input_dtypes) > 0: + output_dtype = bigframes.dtypes.lcd_type(*input_dtypes) + if output_dtype is None: + raise NotImplementedError( + "Cannot stack columns with non-matching dtypes." + ) + else: + output_dtype = pd.Float64Dtype() + return tuple(input_columns), output_dtype def _column_type(self, col_id: str) -> bigframes.dtypes.Dtype: col_offset = self.value_columns.index(col_id) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 62654c1518..6b76f3f53d 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -23,9 +23,11 @@ import bigframes.core from bigframes.core import identifiers, nodes, ordering, window_spec +from bigframes.core.compile.polars import lowering import bigframes.core.expression as ex import bigframes.core.guid as guid import bigframes.core.rewrite +import bigframes.core.rewrite.schema_binding import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -391,7 +393,7 @@ class PolarsCompiler: expr_compiler = PolarsExpressionCompiler() agg_compiler = PolarsAggregateCompiler() - def compile(self, array_value: bigframes.core.ArrayValue) -> pl.LazyFrame: + def compile(self, plan: nodes.BigFrameNode) -> pl.LazyFrame: if not polars_installed: raise ValueError( "Polars is not installed, cannot compile to polars engine." 
@@ -399,10 +401,12 @@ def compile(self, array_value: bigframes.core.ArrayValue) -> pl.LazyFrame: # TODO: Create standard way to configure BFET -> BFET rewrites # Polars has incomplete slice support in lazy mode - node = array_value.node + node = plan node = bigframes.core.rewrite.column_pruning(node) node = nodes.bottom_up(node, bigframes.core.rewrite.rewrite_slice) node = bigframes.core.rewrite.pull_out_window_order(node) + node = bigframes.core.rewrite.schema_binding.bind_schema_to_tree(node) + node = lowering.lower_ops_to_polars(node) return self.compile_node(node) @functools.singledispatchmethod diff --git a/bigframes/core/compile/polars/lowering.py b/bigframes/core/compile/polars/lowering.py new file mode 100644 index 0000000000..48d63e9ed9 --- /dev/null +++ b/bigframes/core/compile/polars/lowering.py @@ -0,0 +1,98 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import dataclasses + +from bigframes import dtypes +from bigframes.core import bigframe_node, expression +from bigframes.core.rewrite import op_lowering +from bigframes.operations import comparison_ops, numeric_ops +import bigframes.operations as ops + +# TODO: Would be more precise to actually have separate op set for polars ops (where they diverge from the original ops) + + +@dataclasses.dataclass +class CoerceArgsRule(op_lowering.OpLoweringRule): + op_type: type[ops.BinaryOp] + + @property + def op(self) -> type[ops.ScalarOp]: + return self.op_type + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, self.op_type) + larg, rarg = _coerce_comparables(expr.children[0], expr.children[1]) + return expr.op.as_expr(larg, rarg) + + +class LowerFloorDivRule(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return numeric_ops.FloorDivOp + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + dividend = expr.children[0] + divisor = expr.children[1] + using_floats = (dividend.output_type == dtypes.FLOAT_DTYPE) or ( + divisor.output_type == dtypes.FLOAT_DTYPE + ) + inf_or_zero = ( + expression.const(float("INF")) if using_floats else expression.const(0) + ) + zero_result = ops.mul_op.as_expr(inf_or_zero, dividend) + divisor_is_zero = ops.eq_op.as_expr(divisor, expression.const(0)) + return ops.where_op.as_expr(zero_result, divisor_is_zero, expr) + + +def _coerce_comparables(expr1: expression.Expression, expr2: expression.Expression): + + target_type = dtypes.coerce_to_common(expr1.output_type, expr2.output_type) + if expr1.output_type != target_type: + expr1 = _lower_cast(ops.AsTypeOp(target_type), expr1) + if expr2.output_type != target_type: + expr2 = _lower_cast(ops.AsTypeOp(target_type), expr2) + return expr1, expr2 + + +# TODO: Need to handle bool->string cast to get capitalization correct +def _lower_cast(cast_op: ops.AsTypeOp, arg: expression.Expression): + if 
arg.output_type == dtypes.BOOL_DTYPE and dtypes.is_numeric(cast_op.to_type): + # bool -> decimal needs two-step cast + new_arg = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(arg) + return cast_op.as_expr(new_arg) + return cast_op.as_expr(arg) + + +LOWER_COMPARISONS = tuple( + CoerceArgsRule(op) + for op in ( + comparison_ops.EqOp, + comparison_ops.EqNullsMatchOp, + comparison_ops.NeOp, + comparison_ops.LtOp, + comparison_ops.GtOp, + comparison_ops.LeOp, + comparison_ops.GeOp, + ) +) + +POLARS_LOWERING_RULES = ( + *LOWER_COMPARISONS, + LowerFloorDivRule(), +) + + +def lower_ops_to_polars(root: bigframe_node.BigFrameNode) -> bigframe_node.BigFrameNode: + return op_lowering.lower_ops(root, rules=POLARS_LOWERING_RULES) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 075089bb7a..30da6b2cb2 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1498,6 +1498,7 @@ def eq_op( x: ibis_types.Value, y: ibis_types.Value, ): + x, y = _coerce_comparables(x, y) return x == y @@ -1507,6 +1508,7 @@ def eq_nulls_match_op( y: ibis_types.Value, ): """Variant of eq_op where nulls match each other. 
Only use where dtypes are known to be same.""" + x, y = _coerce_comparables(x, y) literal = ibis_types.literal("$NULL_SENTINEL$") if hasattr(x, "fill_null"): left = x.cast(ibis_dtypes.str).fill_null(literal) @@ -1523,6 +1525,7 @@ def ne_op( x: ibis_types.Value, y: ibis_types.Value, ): + x, y = _coerce_comparables(x, y) return x != y @@ -1534,6 +1537,17 @@ def _null_or_value(value: ibis_types.Value, where_value: ibis_types.BooleanValue ) +def _coerce_comparables( + x: ibis_types.Value, + y: ibis_types.Value, +): + if x.type().is_boolean() and not y.type().is_boolean(): + x = x.cast(ibis_dtypes.int64) + elif y.type().is_boolean() and not x.type().is_boolean(): + y = y.cast(ibis_dtypes.int64) + return x, y + + @scalar_op_compiler.register_binary_op(ops.and_op) def and_op( x: ibis_types.Value, @@ -1735,6 +1749,7 @@ def lt_op( x: ibis_types.Value, y: ibis_types.Value, ): + x, y = _coerce_comparables(x, y) return x < y @@ -1744,6 +1759,7 @@ def le_op( x: ibis_types.Value, y: ibis_types.Value, ): + x, y = _coerce_comparables(x, y) return x <= y @@ -1753,6 +1769,7 @@ def gt_op( x: ibis_types.Value, y: ibis_types.Value, ): + x, y = _coerce_comparables(x, y) return x > y @@ -1762,6 +1779,7 @@ def ge_op( x: ibis_types.Value, y: ibis_types.Value, ): + x, y = _coerce_comparables(x, y) return x >= y diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 3b7abd8463..606fe41b5e 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -211,6 +211,13 @@ def compile_projection( ) return child.project(projected_cols) + @_compile_node.register + def compile_filter( + self, node: nodes.FilterNode, child: ir.SQLGlotIR + ) -> ir.SQLGlotIR: + condition = scalar_compiler.compile_scalar_expression(node.predicate) + return child.filter(condition) + @_compile_node.register def compile_concat( self, node: nodes.ConcatNode, *children: ir.SQLGlotIR @@ -222,6 +229,14 @@ def compile_concat( 
uid_gen=self.uid_gen, ) + @_compile_node.register + def compile_explode( + self, node: nodes.ExplodeNode, child: ir.SQLGlotIR + ) -> ir.SQLGlotIR: + offsets_col = node.offsets_col.sql if (node.offsets_col is not None) else None + columns = tuple(ref.id.sql for ref in node.column_ids) + return child.explode(columns, offsets_col) + def _replace_unsupported_ops(node: nodes.BigFrameNode): node = nodes.bottom_up(node, rewrite.rewrite_slice) diff --git a/bigframes/core/compile/sqlglot/expressions/__init__.py b/bigframes/core/compile/sqlglot/expressions/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/bigframes/core/compile/sqlglot/expressions/binary_compiler.py b/bigframes/core/compile/sqlglot/expressions/binary_compiler.py new file mode 100644 index 0000000000..ec75d3a3a4 --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/binary_compiler.py @@ -0,0 +1,44 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import sqlglot.expressions as sge + +from bigframes import dtypes +from bigframes import operations as ops +from bigframes.core.compile.sqlglot.expressions.op_registration import OpRegistration +from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr + +BINARY_OP_REGISTRATION = OpRegistration() + + +def compile(op: ops.BinaryOp, left: TypedExpr, right: TypedExpr) -> sge.Expression: + return BINARY_OP_REGISTRATION[op](op, left, right) + + +# TODO: add parenthesize for operators +@BINARY_OP_REGISTRATION.register(ops.add_op) +def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: + if left.dtype == dtypes.STRING_DTYPE and right.dtype == dtypes.STRING_DTYPE: + # String addition + return sge.Concat(expressions=[left.expr, right.expr]) + + # Numerical addition + return sge.Add(this=left.expr, expression=right.expr) + + +@BINARY_OP_REGISTRATION.register(ops.ge_op) +def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: + return sge.GTE(this=left.expr, expression=right.expr) diff --git a/bigframes/core/compile/sqlglot/expressions/nary_compiler.py b/bigframes/core/compile/sqlglot/expressions/nary_compiler.py new file mode 100644 index 0000000000..12f68613d7 --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/nary_compiler.py @@ -0,0 +1,27 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import sqlglot.expressions as sge + +from bigframes import operations as ops +from bigframes.core.compile.sqlglot.expressions.op_registration import OpRegistration +from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr + +NARY_OP_REGISTRATION = OpRegistration() + + +def compile(op: ops.NaryOp, *args: TypedExpr) -> sge.Expression: + return NARY_OP_REGISTRATION[op](op, *args) diff --git a/bigframes/core/compile/sqlglot/expressions/op_registration.py b/bigframes/core/compile/sqlglot/expressions/op_registration.py new file mode 100644 index 0000000000..e30b58a6d2 --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/op_registration.py @@ -0,0 +1,54 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import typing + +from sqlglot import expressions as sge + +from bigframes import operations as ops + +# We should've been more specific about input types. Unfortunately, +# MyPy doesn't support more rigorous checks. 
+CompilationFunc = typing.Callable[..., sge.Expression] + + +class OpRegistration: + def __init__(self) -> None: + self._registered_ops: dict[str, CompilationFunc] = {} + + def register( + self, op: ops.ScalarOp | type[ops.ScalarOp] + ) -> typing.Callable[[CompilationFunc], CompilationFunc]: + def decorator(item: CompilationFunc): + def arg_checker(*args, **kwargs): + if not isinstance(args[0], ops.ScalarOp): + raise ValueError( + f"The first parameter must be an operator. Got {type(args[0])}" + ) + return item(*args, **kwargs) + + key = typing.cast(str, op.name) + if key in self._registered_ops: + raise ValueError(f"{key} is already registered") + self._registered_ops[key] = item + return arg_checker + + return decorator + + def __getitem__(self, key: str | ops.ScalarOp) -> CompilationFunc: + if isinstance(key, ops.ScalarOp): + return self._registered_ops[key.name] + return self._registered_ops[key] diff --git a/bigframes/core/compile/sqlglot/expressions/ternary_compiler.py b/bigframes/core/compile/sqlglot/expressions/ternary_compiler.py new file mode 100644 index 0000000000..9b00771f7d --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/ternary_compiler.py @@ -0,0 +1,29 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import sqlglot.expressions as sge + +from bigframes import operations as ops +from bigframes.core.compile.sqlglot.expressions.op_registration import OpRegistration +from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr + +TERNATRY_OP_REGISTRATION = OpRegistration() + + +def compile( + op: ops.TernaryOp, expr1: TypedExpr, expr2: TypedExpr, expr3: TypedExpr +) -> sge.Expression: + return TERNATRY_OP_REGISTRATION[op](op, expr1, expr2, expr3) diff --git a/bigframes/core/compile/sqlglot/expressions/typed_expr.py b/bigframes/core/compile/sqlglot/expressions/typed_expr.py new file mode 100644 index 0000000000..e693dd94a2 --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/typed_expr.py @@ -0,0 +1,27 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import dataclasses + +import sqlglot.expressions as sge + +from bigframes import dtypes + + +@dataclasses.dataclass(frozen=True) +class TypedExpr: + """SQLGlot expression with type.""" + + expr: sge.Expression + dtype: dtypes.ExpressionType diff --git a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py new file mode 100644 index 0000000000..716917b455 --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py @@ -0,0 +1,72 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import typing + +import sqlglot +import sqlglot.expressions as sge + +from bigframes import operations as ops +from bigframes.core.compile.sqlglot.expressions.op_registration import OpRegistration +from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr + +UNARY_OP_REGISTRATION = OpRegistration() + + +def compile(op: ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return UNARY_OP_REGISTRATION[op](op, expr) + + +@UNARY_OP_REGISTRATION.register(ops.ArrayToStringOp) +def _(op: ops.ArrayToStringOp, expr: TypedExpr) -> sge.Expression: + return sge.ArrayToString(this=expr.expr, expression=f"'{op.delimiter}'") + + +@UNARY_OP_REGISTRATION.register(ops.ArrayIndexOp) +def _(op: ops.ArrayIndexOp, expr: TypedExpr) -> sge.Expression: + return sge.Bracket( + this=expr.expr, + expressions=[sge.Literal.number(op.index)], + safe=True, + offset=False, + ) + + +@UNARY_OP_REGISTRATION.register(ops.ArraySliceOp) +def _(op: ops.ArraySliceOp, expr: TypedExpr) -> sge.Expression: + slice_idx = sqlglot.to_identifier("slice_idx") + + conditions: typing.List[sge.Predicate] = [slice_idx >= op.start] + + if op.stop is not None: + conditions.append(slice_idx < op.stop) + + # local name for each element in the array + el = sqlglot.to_identifier("el") + + selected_elements = ( + sge.select(el) + .from_( + sge.Unnest( + expressions=[expr.expr], + alias=sge.TableAlias(columns=[el]), + offset=slice_idx, + ) + ) + .where(*conditions) + ) + + return sge.array(selected_elements) diff --git a/bigframes/core/compile/sqlglot/scalar_compiler.py b/bigframes/core/compile/sqlglot/scalar_compiler.py index 00ec892620..f553518300 100644 --- a/bigframes/core/compile/sqlglot/scalar_compiler.py +++ b/bigframes/core/compile/sqlglot/scalar_compiler.py @@ -13,25 +13,22 @@ # limitations under the License. 
from __future__ import annotations -import dataclasses import functools import sqlglot.expressions as sge -from bigframes import dtypes from bigframes.core import expression +from bigframes.core.compile.sqlglot.expressions import ( + binary_compiler, + nary_compiler, + ternary_compiler, + typed_expr, + unary_compiler, +) import bigframes.core.compile.sqlglot.sqlglot_ir as ir import bigframes.operations as ops -@dataclasses.dataclass(frozen=True) -class TypedExpr: - """SQLGlot expression with type.""" - - expr: sge.Expression - dtype: dtypes.ExpressionType - - @functools.singledispatch def compile_scalar_expression( expression: expression.Expression, @@ -63,39 +60,21 @@ def compile_constant_expression( def compile_op_expression(expr: expression.OpExpression) -> sge.Expression: # Non-recursively compiles the children scalar expressions. args = tuple( - TypedExpr(compile_scalar_expression(input), input.output_type) + typed_expr.TypedExpr(compile_scalar_expression(input), input.output_type) for input in expr.inputs ) op = expr.op - op_name = expr.op.__class__.__name__ - method_name = f"compile_{op_name.lower()}" - method = globals().get(method_name, None) - if method is None: - raise ValueError( - f"Compilation method '{method_name}' not found for operator '{op_name}'." - ) - if isinstance(op, ops.UnaryOp): - return method(op, args[0]) + return unary_compiler.compile(op, args[0]) elif isinstance(op, ops.BinaryOp): - return method(op, args[0], args[1]) + return binary_compiler.compile(op, args[0], args[1]) elif isinstance(op, ops.TernaryOp): - return method(op, args[0], args[1], args[2]) + return ternary_compiler.compile(op, args[0], args[1], args[2]) elif isinstance(op, ops.NaryOp): - return method(op, *args) + return nary_compiler.compile(op, *args) else: raise TypeError( - f"Operator '{op_name}' has an unrecognized arity or type " + f"Operator '{op.name}' has an unrecognized arity or type " "and cannot be compiled." 
) - - -# TODO: add parenthesize for operators -def compile_addop(op: ops.AddOp, left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.dtype == dtypes.STRING_DTYPE and right.dtype == dtypes.STRING_DTYPE: - # String addition - return sge.Concat(expressions=[left.expr, right.expr]) - - # Numerical addition - return sge.Add(this=left.expr, expression=right.expr) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 47dab209d0..6bc2b55162 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -182,7 +182,7 @@ def from_union( selections = [ sge.Alias( - this=expr.alias_or_name, + this=sge.to_identifier(expr.alias_or_name, quoted=cls.quoted), alias=sge.to_identifier(output_id, quoted=cls.quoted), ) for expr, output_id in zip(select_expr.expressions, output_ids) @@ -250,6 +250,16 @@ def project( new_expr = self._encapsulate_as_cte().select(*projected_cols_expr, append=True) return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) + def filter( + self, + condition: sge.Expression, + ) -> SQLGlotIR: + """Filters the query with the given condition.""" + new_expr = self._encapsulate_as_cte() + return SQLGlotIR( + expr=new_expr.where(condition, append=False), uid_gen=self.uid_gen + ) + def insert( self, destination: bigquery.TableReference, @@ -280,6 +290,96 @@ def replace( ).sql(dialect=self.dialect, pretty=self.pretty) return f"{merge_str}\n{whens_str}" + def explode( + self, + column_names: tuple[str, ...], + offsets_col: typing.Optional[str], + ) -> SQLGlotIR: + num_columns = len(list(column_names)) + assert num_columns > 0, "At least one column must be provided for explode." 
+ if num_columns == 1: + return self._explode_single_column(column_names[0], offsets_col) + else: + return self._explode_multiple_columns(column_names, offsets_col) + + def _explode_single_column( + self, column_name: str, offsets_col: typing.Optional[str] + ) -> SQLGlotIR: + """Helper method to handle the case of exploding a single column.""" + + offset = ( + sge.to_identifier(offsets_col, quoted=self.quoted) if offsets_col else None + ) + column = sge.to_identifier(column_name, quoted=self.quoted) + unnested_column_alias = sge.to_identifier( + next(self.uid_gen.get_uid_stream("bfcol_")), quoted=self.quoted + ) + unnest_expr = sge.Unnest( + expressions=[column], + alias=sge.TableAlias(columns=[unnested_column_alias]), + offset=offset, + ) + selection = sge.Star(replace=[unnested_column_alias.as_(column)]) + # TODO: "CROSS" if not keep_empty else "LEFT" + # TODO: overlaps_with_parent to replace existing column. + new_expr = ( + self._encapsulate_as_cte() + .select(selection, append=False) + .join(unnest_expr, join_type="CROSS") + ) + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) + + def _explode_multiple_columns( + self, + column_names: tuple[str, ...], + offsets_col: typing.Optional[str], + ) -> SQLGlotIR: + """Helper method to handle the case of exploding multiple columns.""" + offset = ( + sge.to_identifier(offsets_col, quoted=self.quoted) if offsets_col else None + ) + columns = [ + sge.to_identifier(column_name, quoted=self.quoted) + for column_name in column_names + ] + + # If there are multiple columns, we need to unnest by zipping the arrays: + # https://cloud.google.com/bigquery/docs/arrays#zipping_arrays + column_lengths = [ + sge.func("ARRAY_LENGTH", sge.to_identifier(column, quoted=self.quoted)) - 1 + for column in columns + ] + generate_array = sge.func( + "GENERATE_ARRAY", + sge.convert(0), + sge.func("LEAST", *column_lengths), + ) + unnested_offset_alias = sge.to_identifier( + next(self.uid_gen.get_uid_stream("bfcol_")), quoted=self.quoted + 
) + unnest_expr = sge.Unnest( + expressions=[generate_array], + alias=sge.TableAlias(columns=[unnested_offset_alias]), + offset=offset, + ) + selection = sge.Star( + replace=[ + sge.Bracket( + this=column, + expressions=[unnested_offset_alias], + safe=True, + offset=False, + ).as_(column) + for column in columns + ] + ) + new_expr = ( + self._encapsulate_as_cte() + .select(selection, append=False) + .join(unnest_expr, join_type="CROSS") + ) + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) + def _encapsulate_as_cte( self, ) -> sge.Select: diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 238b588fea..40ba70c555 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -19,7 +19,7 @@ import functools import itertools import typing -from typing import Generator, Mapping, TypeVar, Union +from typing import Callable, Generator, Mapping, TypeVar, Union import pandas as pd @@ -249,6 +249,10 @@ def is_identity(self) -> bool: """True for identity operation that does not transform input.""" return False + @abc.abstractmethod + def transform_children(self, t: Callable[[Expression], Expression]) -> Expression: + ... 
+ def walk(self) -> Generator[Expression, None, None]: yield self for child in self.children: @@ -311,6 +315,9 @@ def __eq__(self, other): return self.value == other.value and self.dtype == other.dtype + def transform_children(self, t: Callable[[Expression], Expression]) -> Expression: + return self + @dataclasses.dataclass(frozen=True) class UnboundVariableExpression(Expression): @@ -362,6 +369,9 @@ def is_bijective(self) -> bool: def is_identity(self) -> bool: return True + def transform_children(self, t: Callable[[Expression], Expression]) -> Expression: + return self + @dataclasses.dataclass(frozen=True) class DerefOp(Expression): @@ -414,6 +424,9 @@ def is_bijective(self) -> bool: def is_identity(self) -> bool: return True + def transform_children(self, t: Callable[[Expression], Expression]) -> Expression: + return self + @dataclasses.dataclass(frozen=True) class SchemaFieldRefExpression(Expression): @@ -463,12 +476,15 @@ def is_bijective(self) -> bool: def is_identity(self) -> bool: return True + def transform_children(self, t: Callable[[Expression], Expression]) -> Expression: + return self + @dataclasses.dataclass(frozen=True) class OpExpression(Expression): """An expression representing a scalar operation applied to 1 or more argument sub-expressions.""" - op: bigframes.operations.RowOp + op: bigframes.operations.ScalarOp inputs: typing.Tuple[Expression, ...] 
@property @@ -553,6 +569,12 @@ def deterministic(self) -> bool: all(input.deterministic for input in self.inputs) and self.op.deterministic ) + def transform_children(self, t: Callable[[Expression], Expression]) -> Expression: + new_inputs = tuple(t(input) for input in self.inputs) + if new_inputs != self.inputs: + return dataclasses.replace(self, inputs=new_inputs) + return self + def bind_schema_fields( expr: Expression, field_by_id: Mapping[ids.ColumnId, field.Field] diff --git a/bigframes/core/global_session.py b/bigframes/core/global_session.py index 8732b55990..4698e4c4c5 100644 --- a/bigframes/core/global_session.py +++ b/bigframes/core/global_session.py @@ -110,8 +110,8 @@ def get_global_session(): _T = TypeVar("_T") -def with_default_session(func: Callable[..., _T], *args, **kwargs) -> _T: - return func(get_global_session(), *args, **kwargs) +def with_default_session(func_: Callable[..., _T], *args, **kwargs) -> _T: + return func_(get_global_session(), *args, **kwargs) class _GlobalSessionContext: diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index bc8b47d216..f653b8700b 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -174,6 +174,11 @@ def dtypes(self) -> pandas.Series: index=typing.cast(typing.Tuple, self._block.index.names), ) + def __setitem__(self, key, value) -> None: + """Index objects are immutable. 
Use Index constructor to create + modified Index.""" + raise TypeError("Index does not support mutable operations") + @property def size(self) -> int: return self.shape[0] diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 38becd29df..205621fee2 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -1008,6 +1008,14 @@ def referenced_ids(self) -> COLUMN_SET: def _node_expressions(self): return (self.predicate,) + def transform_exprs( + self, fn: Callable[[ex.Expression], ex.Expression] + ) -> FilterNode: + return dataclasses.replace( + self, + predicate=fn(self.predicate), + ) + def remap_vars( self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> FilterNode: @@ -1066,6 +1074,20 @@ def referenced_ids(self) -> COLUMN_SET: def _node_expressions(self): return tuple(map(lambda x: x.scalar_expression, self.by)) + def transform_exprs( + self, fn: Callable[[ex.Expression], ex.Expression] + ) -> OrderByNode: + new_by = cast( + tuple[OrderingExpression, ...], + tuple( + dataclasses.replace( + by_expr, scalar_expression=fn(by_expr.scalar_expression) + ) + for by_expr in self.by + ), + ) + return dataclasses.replace(self, by=new_by) + def remap_vars( self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> OrderByNode: @@ -1078,14 +1100,9 @@ def remap_refs( itertools.chain.from_iterable(map(lambda x: x.referenced_columns, self.by)) ) ref_mapping = {id: ex.DerefOp(mappings[id]) for id in all_refs} - new_by = cast( - tuple[OrderingExpression, ...], - tuple( - by_expr.bind_refs(ref_mapping, allow_partial_bindings=True) - for by_expr in self.by - ), + return self.transform_exprs( + lambda ex: ex.bind_refs(ref_mapping, allow_partial_bindings=True) ) - return dataclasses.replace(self, by=new_by) @dataclasses.dataclass(frozen=True, eq=False) @@ -1293,6 +1310,12 @@ def _node_expressions(self): def additive_base(self) -> BigFrameNode: return self.child + def transform_exprs( + self, fn: Callable[[ex.Expression], 
ex.Expression] + ) -> ProjectionNode: + new_fields = tuple((fn(ex), id) for ex, id in self.assignments) + return dataclasses.replace(self, assignments=new_fields) + def replace_additive_base(self, node: BigFrameNode) -> ProjectionNode: return dataclasses.replace(self, child=node) diff --git a/bigframes/core/rewrite/op_lowering.py b/bigframes/core/rewrite/op_lowering.py new file mode 100644 index 0000000000..a64a4cc8c4 --- /dev/null +++ b/bigframes/core/rewrite/op_lowering.py @@ -0,0 +1,57 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import abc +from typing import Sequence + +from bigframes.core import bigframe_node, expression, nodes +import bigframes.operations as ops + + +class OpLoweringRule(abc.ABC): + @property + @abc.abstractmethod + def op(self) -> type[ops.ScalarOp]: + ... + + @abc.abstractmethod + def lower(self, expr: expression.OpExpression) -> expression.Expression: + ... 
+ + +def lower_ops( + root: bigframe_node.BigFrameNode, rules: Sequence[OpLoweringRule] +) -> bigframe_node.BigFrameNode: + rules_by_op = {rule.op: rule for rule in rules} + + def lower_expr(expr: expression.Expression): + def lower_expr_step(expr: expression.Expression) -> expression.Expression: + if isinstance(expr, expression.OpExpression): + maybe_rule = rules_by_op.get(expr.op.__class__) + if maybe_rule: + return maybe_rule.lower(expr) + return expr + + return lower_expr_step(expr.transform_children(lower_expr_step)) + + def lower_node(node: bigframe_node.BigFrameNode) -> bigframe_node.BigFrameNode: + if isinstance( + node, (nodes.ProjectionNode, nodes.FilterNode, nodes.OrderByNode) + ): + return node.transform_exprs(lower_expr) + else: + return node + + return root.bottom_up(lower_node) diff --git a/bigframes/core/tools/bigquery_schema.py b/bigframes/core/tools/bigquery_schema.py index 227a69e0f7..eef7364a1b 100644 --- a/bigframes/core/tools/bigquery_schema.py +++ b/bigframes/core/tools/bigquery_schema.py @@ -18,6 +18,12 @@ import google.cloud.bigquery +_LEGACY_TO_GOOGLESQL_TYPES = { + "BOOLEAN": "BOOL", + "INTEGER": "INT64", + "FLOAT": "FLOAT64", +} + def _type_to_sql(field: google.cloud.bigquery.SchemaField): """Turn the type information of the field into SQL. @@ -26,7 +32,12 @@ def _type_to_sql(field: google.cloud.bigquery.SchemaField): """ if field.field_type.casefold() in ("record", "struct"): return _to_struct(field.fields) - return field.field_type + + # Map from legacy SQL names (the ones used in the BigQuery schema API) to + # the GoogleSQL types. Importantly, FLOAT is from legacy SQL, but not valid + # in GoogleSQL. See internal issue b/428190014. 
+ type_ = _LEGACY_TO_GOOGLESQL_TYPES.get(field.field_type.upper(), field.field_type) + return type_ def _field_to_sql(field: google.cloud.bigquery.SchemaField): diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 495e242f43..1ca5b8b035 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3004,14 +3004,44 @@ def agg( if utils.is_dict_like(func): # Must check dict-like first because dictionaries are list-like # according to Pandas. - agg_cols = [] - for col_label, agg_func in func.items(): - agg_cols.append(self[col_label].agg(agg_func)) - - from bigframes.core.reshape import api as reshape - - return reshape.concat(agg_cols, axis=1) + aggs = [] + labels = [] + funcnames = [] + for col_label, agg_func in func.items(): + agg_func_list = agg_func if utils.is_list_like(agg_func) else [agg_func] + col_id = self._block.resolve_label_exact(col_label) + if col_id is None: + raise KeyError(f"Column {col_label} does not exist") + for agg_func in agg_func_list: + agg_op = agg_ops.lookup_agg_func(typing.cast(str, agg_func)) + agg_expr = ( + ex.UnaryAggregation(agg_op, ex.deref(col_id)) + if isinstance(agg_op, agg_ops.UnaryAggregateOp) + else ex.NullaryAggregation(agg_op) + ) + aggs.append(agg_expr) + labels.append(col_label) + funcnames.append(agg_func) + + # if any list in dict values, format output differently + if any(utils.is_list_like(v) for v in func.values()): + new_index, _ = self.columns.reindex(labels) + new_index = utils.combine_indices(new_index, pandas.Index(funcnames)) + agg_block, _ = self._block.aggregate( + aggregations=aggs, column_labels=new_index + ) + return DataFrame(agg_block).stack().droplevel(0, axis="index") + else: + new_index, _ = self.columns.reindex(labels) + agg_block, _ = self._block.aggregate( + aggregations=aggs, column_labels=new_index + ) + return bigframes.series.Series( + agg_block.transpose( + single_row_mode=True, original_row_index=pandas.Index([None]) + ) + ) elif utils.is_list_like(func): aggregations = 
[agg_ops.lookup_agg_func(f) for f in func] @@ -3027,7 +3057,7 @@ def agg( ) ) - else: + else: # function name string return bigframes.series.Series( self._block.aggregate_all_and_stack( agg_ops.lookup_agg_func(typing.cast(str, func)) diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index 9e7555431a..a7910127e4 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -668,6 +668,30 @@ def wrapper(func): return wrapper + def deploy_remote_function( + self, + func, + **kwargs, + ): + """Orchestrates the creation of a BigQuery remote function that deploys immediately. + + This method ensures that the remote function is created and available for + use in BigQuery as soon as this call is made. + + Args: + kwargs: + All arguments are passed directly to + :meth:`~bigframes.session.Session.remote_function`. Please see + its docstring for parameter details. + + Returns: + A wrapped remote function, usable in + :meth:`~bigframes.series.Series.apply`. + """ + # TODO(tswast): If we update remote_function to defer deployment, update + # this method to deploy immediately. + return self.remote_function(**kwargs)(func) + def udf( self, input_types: Union[None, type, Sequence[type]] = None, @@ -866,6 +890,32 @@ def wrapper(func): return wrapper + def deploy_udf( + self, + func, + **kwargs, + ): + """Orchestrates the creation of a BigQuery UDF that deploys immediately. + + This method ensures that the UDF is created and available for + use in BigQuery as soon as this call is made. + + Args: + func: + Function to deploy. + kwargs: + All arguments are passed directly to + :meth:`~bigframes.session.Session.udf`. Please see + its docstring for parameter details. + + Returns: + A wrapped Python user defined function, usable in + :meth:`~bigframes.series.Series.apply`. + """ + # TODO(tswast): If we update udf to defer deployment, update this method + # to deploy immediately. 
+ return self.udf(**kwargs)(func) + def _convert_row_processor_sig( signature: inspect.Signature, diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index 10c842c64c..8c7628059a 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -19,8 +19,6 @@ from typing import Dict, Iterable, List, Optional, Sequence, Union import warnings -import numpy as np - from bigframes import dtypes, exceptions, options from bigframes.core import guid, log_adapter @@ -586,207 +584,6 @@ def search( return typing.cast(bigframes.dataframe.DataFrame, search_result) - def top_k( - self, - instruction: str, - model, - k: int = 10, - ground_with_google_search: bool = False, - ): - """ - Ranks each tuple and returns the k best according to the instruction. - - This method employs a quick select algorithm to efficiently compare the pivot - with all other items. By leveraging an LLM (Large Language Model), it then - identifies the top 'k' best answers from these comparisons. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.ai_operators = True - >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 - - >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") - - >>> df = bpd.DataFrame( - ... { - ... "Animals": ["Dog", "Bird", "Cat", "Horse"], - ... "Sounds": ["Woof", "Chirp", "Meow", "Neigh"], - ... }) - >>> df.ai.top_k("{Animals} are more popular as pets", model=model, k=2) - Animals Sounds - 0 Dog Woof - 2 Cat Meow - - [2 rows x 2 columns] - - Args: - instruction (str): - An instruction on how to map the data. This value must contain - column references by name enclosed in braces. 
- For example, to reference a column named "Animals", use "{Animals}" in the - instruction, like: "{Animals} are more popular as pets" - - model (bigframes.ml.llm.GeminiTextGenerator): - A GeminiTextGenerator provided by the Bigframes ML package. - - k (int, default 10): - The number of rows to return. - - ground_with_google_search (bool, default False): - Enables Grounding with Google Search for the GeminiTextGenerator model. - When set to True, the model incorporates relevant information from Google - Search results into its responses, enhancing their accuracy and factualness. - Note: Using this feature may impact billing costs. Refer to the pricing - page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models - The default is `False`. - - Returns: - bigframes.dataframe.DataFrame: A new DataFrame with the top k rows. - - Raises: - NotImplementedError: when the AI operator experiment is off. - ValueError: when the instruction refers to a non-existing column, or when no - columns are referred to. - """ - if not options.experiments.ai_operators: - raise NotImplementedError() - - import bigframes.dataframe - import bigframes.series - - self._validate_model(model) - columns = self._parse_columns(instruction) - for column in columns: - if column not in self._df.columns: - raise ValueError(f"Column {column} not found.") - if len(columns) > 1: - raise NotImplementedError("AI top K are limited to a single column.") - - if ground_with_google_search: - msg = exceptions.format_message( - "Enables Grounding with Google Search may impact billing cost. 
See pricing " - "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" - ) - warnings.warn(msg, category=UserWarning) - - work_estimate = int(len(self._df) * (len(self._df) - 1) / 2) - self._confirm_operation(work_estimate) - - df: bigframes.dataframe.DataFrame = self._df[columns].copy() - column = columns[0] - if df[column].dtype != dtypes.STRING_DTYPE: - df[column] = df[column].astype(dtypes.STRING_DTYPE) - - # `index` is reserved for the `reset_index` below. - if column == "index": - raise ValueError( - "Column name 'index' is reserved. Please choose a different name." - ) - - if k < 1: - raise ValueError("k must be an integer greater than or equal to 1.") - - user_instruction = self._format_instruction(instruction, columns) - - n = df.shape[0] - if k >= n: - return df - - # Create a unique index and duplicate it as the "index" column. This workaround - # is needed for the select search algorithm due to unimplemented bigFrame methods. - df = df.reset_index().rename(columns={"index": "old_index"}).reset_index() - - # Initialize a status column to track the selection status of each item. 
- # - None: Unknown/not yet processed - # - 1.0: Selected as part of the top-k items - # - -1.0: Excluded from the top-k items - status_column = guid.generate_guid("status") - df[status_column] = bigframes.series.Series( - None, dtype=dtypes.FLOAT_DTYPE, session=df._session - ) - - num_selected = 0 - while num_selected < k: - df, num_new_selected = self._topk_partition( - df, - column, - status_column, - user_instruction, - model, - k - num_selected, - ground_with_google_search, - ) - num_selected += num_new_selected - - result_df: bigframes.dataframe.DataFrame = self._df.copy() - return result_df[df.set_index("old_index")[status_column] > 0.0] - - @staticmethod - def _topk_partition( - df, - column: str, - status_column: str, - user_instruction: str, - model, - k: int, - ground_with_google_search: bool, - ): - output_instruction = ( - "Given a question and two documents, choose the document that best answers " - "the question. Respond with 'Document 1' or 'Document 2'. You must choose " - "one, even if neither is ideal. " - ) - - # Random pivot selection for improved average quickselect performance. - pending_df = df[df[status_column].isna()] - pivot_iloc = np.random.randint(0, pending_df.shape[0]) - pivot_index = pending_df.iloc[pivot_iloc]["index"] - pivot_df = pending_df[pending_df["index"] == pivot_index] - - # Build a prompt to compare the pivot item's relevance to other pending items. 
- prompt_s = pending_df[pending_df["index"] != pivot_index][column] - prompt_s = ( - f"{output_instruction}\n\nQuestion: {user_instruction}\n" - + f"\nDocument 1: {column} " - + pivot_df.iloc[0][column] - + f"\nDocument 2: {column} " - + prompt_s # type:ignore - ) - - import bigframes.dataframe - - predict_df = typing.cast( - bigframes.dataframe.DataFrame, - model.predict( - prompt_s, - temperature=0.0, - ground_with_google_search=ground_with_google_search, - ), - ) - - marks = predict_df["ml_generate_text_llm_result"].str.contains("2") - more_relavant: bigframes.dataframe.DataFrame = df[marks] - less_relavent: bigframes.dataframe.DataFrame = df[~marks] - - num_more_relavant = more_relavant.shape[0] - if k < num_more_relavant: - less_relavent[status_column] = -1.0 - pivot_df[status_column] = -1.0 - df = df.combine_first(less_relavent).combine_first(pivot_df) - return df, 0 - else: # k >= num_more_relavant - more_relavant[status_column] = 1.0 - df = df.combine_first(more_relavant) - if k >= num_more_relavant + 1: - pivot_df[status_column] = 1.0 - df = df.combine_first(pivot_df) - return df, num_more_relavant + 1 - else: - return df, num_more_relavant - def sim_join( self, other, diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index a9d1c31865..f163d25757 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -40,6 +40,7 @@ from bigframes.pandas.io.api import ( _read_gbq_colab, from_glob_path, + read_arrow, read_csv, read_gbq, read_gbq_function, @@ -117,6 +118,22 @@ def remote_function( remote_function.__doc__ = inspect.getdoc(bigframes.session.Session.remote_function) +def deploy_remote_function( + func, + **kwargs, +): + return global_session.with_default_session( + bigframes.session.Session.deploy_remote_function, + func=func, + **kwargs, + ) + + +deploy_remote_function.__doc__ = inspect.getdoc( + bigframes.session.Session.deploy_remote_function +) + + def udf( *, input_types: Union[None, type, Sequence[type]] = 
None, @@ -140,6 +157,20 @@ def udf( udf.__doc__ = inspect.getdoc(bigframes.session.Session.udf) +def deploy_udf( + func, + **kwargs, +): + return global_session.with_default_session( + bigframes.session.Session.deploy_udf, + func=func, + **kwargs, + ) + + +deploy_udf.__doc__ = inspect.getdoc(bigframes.session.Session.deploy_udf) + + @typing.overload def to_datetime( arg: Union[ @@ -330,11 +361,14 @@ def reset_session(): clean_up_by_session_id, concat, cut, + deploy_remote_function, + deploy_udf, get_default_session_id, get_dummies, merge, qcut, read_csv, + read_arrow, read_gbq, _read_gbq_colab, read_gbq_function, diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index 608eaf5a82..65435bd902 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -44,6 +44,7 @@ ReadPickleBuffer, StorageOptions, ) +import pyarrow as pa import bigframes._config as config import bigframes.core.global_session as global_session @@ -72,6 +73,21 @@ # method and its arguments. +def read_arrow(pa_table: pa.Table) -> bigframes.dataframe.DataFrame: + """Load a PyArrow Table to a BigQuery DataFrames DataFrame. + + Args: + pa_table (pyarrow.Table): + PyArrow table to load data from. + + Returns: + bigframes.dataframe.DataFrame: + A new DataFrame representing the data from the PyArrow table. 
+ """ + session = global_session.get_global_session() + return session.read_arrow(pa_table=pa_table) + + def read_csv( filepath_or_buffer: str | IO["bytes"], *, @@ -218,8 +234,27 @@ def read_gbq( read_gbq.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq) +def _run_read_gbq_colab_sessionless_dry_run( + query: str, + *, + pyformat_args: Dict[str, Any], +) -> pandas.Series: + """Run a dry_run without a session.""" + + query_formatted = bigframes.core.pyformat.pyformat( + query, + pyformat_args=pyformat_args, + dry_run=True, + ) + bqclient = _get_bqclient() + job = _dry_run(query_formatted, bqclient) + return dry_runs.get_query_stats_with_inferred_dtypes(job, (), ()) + + def _try_read_gbq_colab_sessionless_dry_run( - create_query: Callable[[], str], + query: str, + *, + pyformat_args: Dict[str, Any], ) -> Optional[pandas.Series]: """Run a dry_run without a session, only if the session hasn't yet started.""" @@ -230,10 +265,9 @@ def _try_read_gbq_colab_sessionless_dry_run( # to local data and not any BigQuery tables. with _default_location_lock: if not config.options.bigquery._session_started: - bqclient = _get_bqclient() - query = create_query() - job = _dry_run(query, bqclient) - return dry_runs.get_query_stats_with_inferred_dtypes(job, (), ()) + return _run_read_gbq_colab_sessionless_dry_run( + query, pyformat_args=pyformat_args + ) # Explicitly return None to indicate that we didn't run the dry run query. return None @@ -286,21 +320,13 @@ def _read_gbq_colab( if pyformat_args is None: pyformat_args = {} - # Delay formatting the query with the special "session-less" logic. This - # avoids doing unnecessary work if the session already has a location or has - # already started. - create_query = functools.partial( - bigframes.core.pyformat.pyformat, - query_or_table, - pyformat_args=pyformat_args, - dry_run=True, - ) - # Only try to set the global location if it's not a dry run. We don't want # to bind to a location too early. 
This is especially important if the query # only refers to local data and not any BigQuery tables. if dry_run: - result = _try_read_gbq_colab_sessionless_dry_run(create_query) + result = _try_read_gbq_colab_sessionless_dry_run( + query_or_table, pyformat_args=pyformat_args + ) if result is not None: return result @@ -309,6 +335,15 @@ def _read_gbq_colab( # started. That means we can safely call the "real" _read_gbq_colab, # which generates slightly nicer SQL. else: + # Delay formatting the query with the special "session-less" logic. This + # avoids doing unnecessary work if the session already has a location or has + # already started. + create_query = functools.partial( + bigframes.core.pyformat.pyformat, + query_or_table, + pyformat_args=pyformat_args, + dry_run=True, + ) _set_default_session_location_if_possible_deferred_query(create_query) return global_session.with_default_session( diff --git a/bigframes/series.py b/bigframes/series.py index ae6cd7b2ad..ebc2913f78 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1598,6 +1598,10 @@ def __getattr__(self, key: str): else: raise AttributeError(key) + def __setitem__(self, key, value) -> None: + """Set item using direct assignment, delegating to .loc indexer.""" + self.loc[key] = value + def _apply_aggregation( self, op: agg_ops.UnaryAggregateOp | agg_ops.NullaryAggregateOp ) -> Any: diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index c06233bad3..9d113743cf 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -55,12 +55,14 @@ ReadPickleBuffer, StorageOptions, ) +import pyarrow as pa from bigframes import exceptions as bfe from bigframes import version import bigframes._config.bigquery_options as bigquery_options import bigframes.clients import bigframes.constants +import bigframes.core from bigframes.core import blocks, log_adapter, utils import bigframes.core.pyformat @@ -255,6 +257,7 @@ def __init__( 
storage_manager=self._temp_storage_manager, strictly_ordered=self._strictly_ordered, metrics=self._metrics, + enable_polars_execution=context.enable_polars_execution, ) def __del__(self): @@ -966,6 +969,22 @@ def _read_pandas_inline( local_block = blocks.Block.from_local(pandas_dataframe, self) return dataframe.DataFrame(local_block) + def read_arrow(self, pa_table: pa.Table) -> bigframes.dataframe.DataFrame: + """Load a PyArrow Table to a BigQuery DataFrames DataFrame. + + Args: + pa_table (pyarrow.Table): + PyArrow table to load data from. + + Returns: + bigframes.dataframe.DataFrame: + A new DataFrame representing the data from the PyArrow table. + """ + import bigframes.dataframe as dataframe + + local_block = blocks.Block.from_pyarrow(pa_table, self) + return dataframe.DataFrame(local_block) + def read_csv( self, filepath_or_buffer: str | IO["bytes"], @@ -1343,6 +1362,40 @@ def _check_file_size(self, filepath: str): "for large files to avoid loading the file into local memory." ) + def deploy_remote_function( + self, + func, + **kwargs, + ): + """Orchestrates the creation of a BigQuery remote function that deploys immediately. + + This method ensures that the remote function is created and available for + use in BigQuery as soon as this call is made. + + Args: + func: + Function to deploy. + kwargs: + All arguments are passed directly to + :meth:`~bigframes.session.Session.remote_function`. Please see + its docstring for parameter details. + + Returns: + A wrapped remote function, usable in + :meth:`~bigframes.series.Series.apply`. + """ + return self._function_session.deploy_remote_function( + func, + # Session-provided arguments. + session=self, + bigquery_client=self._clients_provider.bqclient, + bigquery_connection_client=self._clients_provider.bqconnectionclient, + cloud_functions_client=self._clients_provider.cloudfunctionsclient, + resource_manager_client=self._clients_provider.resourcemanagerclient, + # User-provided arguments. 
+ **kwargs, + ) + def remote_function( self, # Make sure that the input/output types, and dataset can be used @@ -1565,9 +1618,15 @@ def remote_function( `bigframes_remote_function` - The bigquery remote function capable of calling into `bigframes_cloud_function`. """ return self._function_session.remote_function( + # Session-provided arguments. + session=self, + bigquery_client=self._clients_provider.bqclient, + bigquery_connection_client=self._clients_provider.bqconnectionclient, + cloud_functions_client=self._clients_provider.cloudfunctionsclient, + resource_manager_client=self._clients_provider.resourcemanagerclient, + # User-provided arguments. input_types=input_types, output_type=output_type, - session=self, dataset=dataset, bigquery_connection=bigquery_connection, reuse=reuse, @@ -1585,6 +1644,37 @@ def remote_function( cloud_build_service_account=cloud_build_service_account, ) + def deploy_udf( + self, + func, + **kwargs, + ): + """Orchestrates the creation of a BigQuery UDF that deploys immediately. + + This method ensures that the UDF is created and available for + use in BigQuery as soon as this call is made. + + Args: + func: + Function to deploy. + kwargs: + All arguments are passed directly to + :meth:`~bigframes.session.Session.udf`. Please see + its docstring for parameter details. + + Returns: + A wrapped Python user defined function, usable in + :meth:`~bigframes.series.Series.apply`. + """ + return self._function_session.deploy_udf( + func, + # Session-provided arguments. + session=self, + bigquery_client=self._clients_provider.bqclient, + # User-provided arguments. + **kwargs, + ) + def udf( self, *, @@ -1726,9 +1816,12 @@ def udf( deployed for the user defined code. """ return self._function_session.udf( + # Session-provided arguments. + session=self, + bigquery_client=self._clients_provider.bqclient, + # User-provided arguments. 
input_types=input_types, output_type=output_type, - session=self, dataset=dataset, bigquery_connection=bigquery_connection, name=name, diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 9ad8da33a8..6750652bc2 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -41,7 +41,13 @@ import bigframes.core.tree_properties as tree_properties import bigframes.dtypes import bigframes.features -from bigframes.session import executor, loader, local_scan_executor, read_api_execution +from bigframes.session import ( + executor, + loader, + local_scan_executor, + read_api_execution, + semi_executor, +) import bigframes.session._io.bigquery as bq_io import bigframes.session.metrics import bigframes.session.planner @@ -147,6 +153,7 @@ def __init__( *, strictly_ordered: bool = True, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, + enable_polars_execution: bool = False, ): self.bqclient = bqclient self.storage_manager = storage_manager @@ -155,14 +162,21 @@ def __init__( self.metrics = metrics self.loader = loader self.bqstoragereadclient = bqstoragereadclient - # Simple left-to-right precedence for now - self._semi_executors = ( + self._enable_polars_execution = enable_polars_execution + self._semi_executors: Sequence[semi_executor.SemiExecutor] = ( read_api_execution.ReadApiSemiExecutor( bqstoragereadclient=bqstoragereadclient, project=self.bqclient.project, ), local_scan_executor.LocalScanExecutor(), ) + if enable_polars_execution: + from bigframes.session import polars_executor + + self._semi_executors = ( + *self._semi_executors, + polars_executor.PolarsExecutor(), + ) self._upload_lock = threading.Lock() def to_sql( @@ -637,8 +651,8 @@ def _execute_plan( """Just execute whatever plan as is, without further caching or decomposition.""" # First try to execute fast-paths if not output_spec.require_bq_table: - for semi_executor in self._semi_executors: - 
maybe_result = semi_executor.execute(plan, ordered=ordered, peek=peek) + for exec in self._semi_executors: + maybe_result = exec.execute(plan, ordered=ordered, peek=peek) if maybe_result: return maybe_result diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py index 6e3e15499d..ec00e38606 100644 --- a/bigframes/session/polars_executor.py +++ b/bigframes/session/polars_executor.py @@ -20,6 +20,7 @@ from bigframes.core import array_value, bigframe_node, expression, local_data, nodes import bigframes.operations +from bigframes.operations import aggregations as agg_ops from bigframes.session import executor, semi_executor if TYPE_CHECKING: @@ -31,9 +32,21 @@ nodes.OrderByNode, nodes.ReversedNode, nodes.SelectionNode, + nodes.ProjectionNode, + nodes.SliceNode, + nodes.AggregateNode, ) -_COMPATIBLE_SCALAR_OPS = () +_COMPATIBLE_SCALAR_OPS = ( + bigframes.operations.eq_op, + bigframes.operations.eq_null_match_op, + bigframes.operations.ne_op, + bigframes.operations.gt_op, + bigframes.operations.lt_op, + bigframes.operations.ge_op, + bigframes.operations.le_op, +) +_COMPATIBLE_AGG_OPS = (agg_ops.SizeOp, agg_ops.SizeUnaryOp) def _get_expr_ops(expr: expression.Expression) -> set[bigframes.operations.ScalarOp]: @@ -47,7 +60,8 @@ def _is_node_polars_executable(node: nodes.BigFrameNode): return False for expr in node._node_expressions: if isinstance(expr, expression.Aggregation): - return False + if not type(expr.op) in _COMPATIBLE_AGG_OPS: + return False if isinstance(expr, expression.Expression): if not _get_expr_ops(expr).issubset(_COMPATIBLE_SCALAR_OPS): return False @@ -72,7 +86,7 @@ def execute( # Note: Ignoring ordered flag, as just executing totally ordered is fine. 
try: lazy_frame: pl.LazyFrame = self._compiler.compile( - array_value.ArrayValue(plan) + array_value.ArrayValue(plan).node ) except Exception: return None diff --git a/bigframes/testing/polars_session.py b/bigframes/testing/polars_session.py index 7b898a9f00..3710c40eae 100644 --- a/bigframes/testing/polars_session.py +++ b/bigframes/testing/polars_session.py @@ -41,7 +41,7 @@ def peek( """ A 'peek' efficiently accesses a small number of rows in the dataframe. """ - lazy_frame: polars.LazyFrame = self.compiler.compile(array_value) + lazy_frame: polars.LazyFrame = self.compiler.compile(array_value.node) pa_table = lazy_frame.collect().limit(n_rows).to_arrow() # Currently, pyarrow types might not quite be exactly the ones in the bigframes schema. # Nullability may be different, and might use large versions of list, string datatypes. @@ -64,7 +64,7 @@ def execute( """ Execute the ArrayValue, storing the result to a temporary session-owned table. """ - lazy_frame: polars.LazyFrame = self.compiler.compile(array_value) + lazy_frame: polars.LazyFrame = self.compiler.compile(array_value.node) pa_table = lazy_frame.collect().to_arrow() # Currently, pyarrow types might not quite be exactly the ones in the bigframes schema. # Nullability may be different, and might use large versions of list, string datatypes. diff --git a/bigframes/version.py b/bigframes/version.py index 5d2de2f97f..4f3c9a5124 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.8.0" +__version__ = "2.9.0" # {x-release-please-start-date} -__release_date__ = "2025-06-23" +__release_date__ = "2025-06-30" # {x-release-please-end} diff --git a/notebooks/experimental/ai_operators.ipynb b/notebooks/experimental/ai_operators.ipynb index f830787801..977f7b9d74 100644 --- a/notebooks/experimental/ai_operators.ipynb +++ b/notebooks/experimental/ai_operators.ipynb @@ -35,12 +35,12 @@ "\n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", @@ -1064,129 +1064,6 @@ "animals.ai.join(animals, \"{left.animal} generally weighs heavier than {right.animal}\", model=gemini_model)" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "kU7BsyTyiouX" - }, - "source": [ - "## AI Top K" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "s9QePXEoiouX" - }, - "source": [ - "AI Top K selects the top K values based on your instruction. 
Here is an example:" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "id": "bMQqtyZ2iouX" - }, - "outputs": [], - "source": [ - "df = bpd.DataFrame({\"Animals\": [\"Corgi\", \"Orange Cat\", \"Parrot\", \"Tarantula\"]})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KiljGBSCiouX" - }, - "source": [ - "You want to find the top two most popular pets:" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 159 - }, - "id": "OZv5WUGIiouX", - "outputId": "ae1cee27-cc31-455e-c4ac-c0a9a5cf4ca5" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Animals
0Corgi
1Orange Cat
\n", - "

2 rows × 1 columns

\n", - "
[2 rows x 1 columns in total]" - ], - "text/plain": [ - " Animals\n", - "0 Corgi\n", - "1 Orange Cat\n", - "\n", - "[2 rows x 1 columns]" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.ai.top_k(\"{Animals} are more popular as pets\", model=gemini_model, k=2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dC8fyu3aiouX" - }, - "source": [ - "Under the hood, the AI top K operator performs pair-wise comparisons with LLM. The top K results are returned in the order of their indices instead of their ranks." - ] - }, { "cell_type": "markdown", "metadata": { diff --git a/notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb b/notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb index 05e75b37f0..5f6dede106 100644 --- a/notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb +++ b/notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb @@ -31,12 +31,12 @@ "\n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", @@ -71,10 +71,14 @@ "source": [ "PROJECT = \"bigframes-dev\" # replace with your project\n", "\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\") \n", + "\n", "import bigframes\n", "# Setup project\n", "bigframes.options.bigquery.project = PROJECT\n", "bigframes.options.display.progress_bar = None\n", + "bigframes.options.bigquery.ordering_mode = \"partial\" # Optional: partial ordering mode can accelerate executions and save costs\n", "\n", "import bigframes.pandas as bpd" ] @@ -138,603 +142,603 @@ " \n", " \n", " 0\n", - " 1304531\n", - " 597\n", - " 2016-08-05 10:55:00+00:00\n", - " San Francisco Caltrain 2 (330 Townsend)\n", - " 69\n", - " 2016-08-05 11:05:00+00:00\n", - " Powell Street BART\n", - " 39\n", - " 214\n", - " 95121\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 
201802092135083596\n", + " 788\n", + " 2018-02-09 21:35:08+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 2018-02-09 21:48:17+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 3596\n", " <NA>\n", + " ...\n", " <NA>\n", - " None\n", - " None\n", + " 37.792714\n", + " -122.24878\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", + " Yes\n", + " POINT (-122.24878 37.79271)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 1\n", - " 184870\n", - " 403\n", - " 2014-02-14 14:50:00+00:00\n", - " Howard at 2nd\n", - " 63\n", - " 2014-02-14 14:56:00+00:00\n", - " Commercial at Montgomery\n", - " 45\n", - " 342\n", - " 94122\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 20171217135737144\n", + " 1072\n", + " 2017-12-17 13:57:37+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 2017-12-17 14:15:30+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 144\n", " <NA>\n", + " ...\n", " <NA>\n", + " 37.792714\n", + " -122.24878\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", " <NA>\n", - " None\n", - " None\n", + " POINT (-122.24878 37.79271)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 2\n", - " 20170702115603836\n", - " 16695\n", - " 2017-07-02 11:56:03+00:00\n", - " Union Square (Powell St at Post St)\n", - " 324\n", - " 2017-07-02 16:34:19+00:00\n", - " Union Square (Powell St at Post St)\n", - " 324\n", - " 836\n", + " 201803261642393539\n", + " 486\n", + " 2018-03-26 16:42:39+00:00\n", + " 10th St at Fallon St\n", + " 201\n", + " 2018-03-26 16:50:46+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 3539\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.7883\n", - " -122.408531\n", - " 37.7883\n", - " -122.408531\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " POINT (-122.40853 37.7883)\n", - " POINT (-122.40853 37.7883)\n", + " 37.797673\n", + " -122.262997\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", + " Yes\n", + " POINT (-122.263 
37.79767)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 3\n", - " 1066810\n", - " 953\n", - " 2016-01-21 08:24:00+00:00\n", - " Civic Center BART (7th at Market)\n", - " 72\n", - " 2016-01-21 08:40:00+00:00\n", - " Embarcadero at Sansome\n", - " 60\n", - " 212\n", - " 94103\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 201802281657253632\n", + " 560\n", + " 2018-02-28 16:57:25+00:00\n", + " 10th St at Fallon St\n", + " 201\n", + " 2018-02-28 17:06:46+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 3632\n", " <NA>\n", + " ...\n", " <NA>\n", - " None\n", - " None\n", + " 37.797673\n", + " -122.262997\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", + " Yes\n", + " POINT (-122.263 37.79767)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 4\n", - " 220481\n", - " 679\n", - " 2014-03-19 19:20:00+00:00\n", - " San Francisco Caltrain 2 (330 Townsend)\n", - " 69\n", - " 2014-03-19 19:31:00+00:00\n", - " Civic Center BART (7th at Market)\n", - " 72\n", - " 478\n", - " 94107\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", + " 201708152357422491\n", + " 965\n", + " 2017-08-15 23:57:42+00:00\n", + " 10th St at Fallon St\n", + " 201\n", + " 2017-08-16 00:13:48+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 2491\n", " <NA>\n", + " ...\n", " <NA>\n", + " 37.797673\n", + " -122.262997\n", + " 37.792714\n", + " -122.24878\n", " <NA>\n", " <NA>\n", " <NA>\n", - " None\n", - " None\n", + " POINT (-122.263 37.79767)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 5\n", - " 738474\n", - " 358\n", - " 2015-04-23 16:45:00+00:00\n", - " 2nd at Folsom\n", - " 62\n", - " 2015-04-23 16:51:00+00:00\n", - " Steuart at Market\n", - " 74\n", - " 443\n", - " 94105\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 201801161800473291\n", + " 489\n", + " 2018-01-16 18:00:47+00:00\n", + " 10th St at Fallon St\n", + " 
201\n", + " 2018-01-16 18:08:56+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 3291\n", " <NA>\n", + " ...\n", " <NA>\n", - " None\n", - " None\n", + " 37.797673\n", + " -122.262997\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", + " Yes\n", + " POINT (-122.263 37.79767)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 6\n", - " 229264\n", - " 286\n", - " 2014-03-27 17:56:00+00:00\n", - " Embarcadero at Sansome\n", - " 60\n", - " 2014-03-27 18:01:00+00:00\n", - " Davis at Jackson\n", - " 42\n", - " 342\n", - " 94133\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 201802201913231257\n", + " 596\n", + " 2018-02-20 19:13:23+00:00\n", + " 10th St at Fallon St\n", + " 201\n", + " 2018-02-20 19:23:19+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 1257\n", " <NA>\n", + " ...\n", " <NA>\n", - " None\n", - " None\n", + " 37.797673\n", + " -122.262997\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", + " Yes\n", + " POINT (-122.263 37.79767)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 7\n", - " 352010\n", - " 3621\n", - " 2014-07-06 13:55:00+00:00\n", - " Embarcadero at Sansome\n", - " 60\n", - " 2014-07-06 14:55:00+00:00\n", - " Embarcadero at Sansome\n", - " 60\n", - " 390\n", - " 4038\n", - " ...\n", - " Customer\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 201708242325001279\n", + " 1341\n", + " 2017-08-24 23:25:00+00:00\n", + " 10th St at Fallon St\n", + " 201\n", + " 2017-08-24 23:47:22+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 1279\n", " <NA>\n", + " ...\n", " <NA>\n", + " 37.797673\n", + " -122.262997\n", + " 37.792714\n", + " -122.24878\n", + " 1969\n", + " Male\n", " <NA>\n", - " None\n", - " None\n", + " POINT (-122.263 37.79767)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 8\n", - " 156255\n", - " 416\n", - " 2014-01-16 18:06:00+00:00\n", - " Embarcadero at Bryant\n", - " 54\n", - " 2014-01-16 
18:13:00+00:00\n", - " San Francisco Caltrain (Townsend at 4th)\n", - " 70\n", - " 510\n", - " 94107\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 20170913210653295\n", + " 367\n", + " 2017-09-13 21:06:53+00:00\n", + " 10th St at Fallon St\n", + " 201\n", + " 2017-09-13 21:13:00+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 295\n", " <NA>\n", + " ...\n", " <NA>\n", + " 37.797673\n", + " -122.262997\n", + " 37.792714\n", + " -122.24878\n", + " 1987\n", + " Male\n", " <NA>\n", - " None\n", - " None\n", + " POINT (-122.263 37.79767)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 9\n", - " 1040197\n", - " 1054\n", - " 2015-12-15 18:05:00+00:00\n", - " Steuart at Market\n", - " 74\n", - " 2015-12-15 18:22:00+00:00\n", - " San Francisco Caltrain (Townsend at 4th)\n", - " 70\n", - " 700\n", - " 94111\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", + " 201708192053311490\n", + " 743\n", + " 2017-08-19 20:53:31+00:00\n", + " 2nd Ave at E 18th St\n", + " 200\n", + " 2017-08-19 21:05:54+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 1490\n", " <NA>\n", + " ...\n", " <NA>\n", + " 37.800214\n", + " -122.25381\n", + " 37.792714\n", + " -122.24878\n", " <NA>\n", " <NA>\n", " <NA>\n", - " None\n", - " None\n", + " POINT (-122.25381 37.80021)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 10\n", - " 1152693\n", - " 562\n", - " 2016-04-07 08:18:00+00:00\n", - " San Francisco Caltrain (Townsend at 4th)\n", - " 70\n", - " 2016-04-07 08:27:00+00:00\n", - " Steuart at Market\n", - " 74\n", - " 419\n", - " 94158\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", + " 20170810204454839\n", + " 1256\n", + " 2017-08-10 20:44:54+00:00\n", + " 2nd Ave at E 18th St\n", + " 200\n", + " 2017-08-10 21:05:50+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 839\n", " <NA>\n", + " ...\n", " <NA>\n", + " 37.800214\n", + " -122.25381\n", + " 37.792714\n", + " -122.24878\n", " <NA>\n", " 
<NA>\n", " <NA>\n", - " None\n", - " None\n", + " POINT (-122.25381 37.80021)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 11\n", - " 201804191735183401\n", - " 887\n", - " 2018-04-19 17:35:18+00:00\n", - " Montgomery St BART Station (Market St at 2nd St)\n", - " 21\n", - " 2018-04-19 17:50:06+00:00\n", - " Civic Center/UN Plaza BART Station (Market St ...\n", - " 44\n", - " 3401\n", + " 201711181823281960\n", + " 353\n", + " 2017-11-18 18:23:28+00:00\n", + " 2nd Ave at E 18th St\n", + " 200\n", + " 2017-11-18 18:29:22+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 1960\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.789625\n", - " -122.400811\n", - " 37.781074\n", - " -122.411738\n", - " 1979\n", + " 37.800214\n", + " -122.25381\n", + " 37.792714\n", + " -122.24878\n", + " 1988\n", " Male\n", - " No\n", - " POINT (-122.40081 37.78963)\n", - " POINT (-122.41174 37.78107)\n", + " <NA>\n", + " POINT (-122.25381 37.80021)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 12\n", - " 209283\n", - " 943\n", - " 2014-03-11 09:01:00+00:00\n", - " South Van Ness at Market\n", - " 66\n", - " 2014-03-11 09:16:00+00:00\n", - " Temporary Transbay Terminal (Howard at Beale)\n", - " 55\n", - " 532\n", - " 94105\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 201801111613101305\n", + " 858\n", + " 2018-01-11 16:13:10+00:00\n", + " Frank H Ogawa Plaza\n", + " 7\n", + " 2018-01-11 16:27:28+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 1305\n", " <NA>\n", + " ...\n", " <NA>\n", - " None\n", - " None\n", + " 37.804562\n", + " -122.271738\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", + " Yes\n", + " POINT (-122.27174 37.80456)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 13\n", - " 201708281404312530\n", - " 389\n", - " 2017-08-28 14:04:31+00:00\n", - " 16th St at Prosper St\n", - " 105\n", - " 2017-08-28 14:11:00+00:00\n", - " Mission Playground\n", - " 121\n", - " 
2530\n", + " 201712181738372587\n", + " 807\n", + " 2017-12-18 17:38:37+00:00\n", + " Frank H Ogawa Plaza\n", + " 7\n", + " 2017-12-18 17:52:04+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 2587\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.764285\n", - " -122.431804\n", - " 37.75921\n", - " -122.421339\n", - " 1981\n", + " 37.804562\n", + " -122.271738\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", " Male\n", " <NA>\n", - " POINT (-122.4318 37.76428)\n", - " POINT (-122.42134 37.75921)\n", + " POINT (-122.27174 37.80456)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 14\n", - " 20171124115158841\n", - " 384\n", - " 2017-11-24 11:51:58+00:00\n", - " 2nd Ave at E 18th St\n", - " 200\n", - " 2017-11-24 11:58:23+00:00\n", - " El Embarcadero at Grand Ave\n", - " 197\n", - " 841\n", + " 201803161910283751\n", + " 564\n", + " 2018-03-16 19:10:28+00:00\n", + " Frank H Ogawa Plaza\n", + " 7\n", + " 2018-03-16 19:19:52+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 3751\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.800214\n", - " -122.25381\n", - " 37.808848\n", - " -122.24968\n", - " 1977\n", - " Female\n", - " <NA>\n", - " POINT (-122.25381 37.80021)\n", - " POINT (-122.24968 37.80885)\n", + " 37.804562\n", + " -122.271738\n", + " 37.792714\n", + " -122.24878\n", + " 1987\n", + " Male\n", + " No\n", + " POINT (-122.27174 37.80456)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 15\n", - " 1321042\n", - " 874\n", - " 2016-08-18 08:14:00+00:00\n", - " San Francisco Caltrain (Townsend at 4th)\n", - " 70\n", - " 2016-08-18 08:29:00+00:00\n", - " Beale at Market\n", - " 56\n", - " 390\n", - " 95050\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 201802241826551215\n", + " 1235\n", + " 2018-02-24 18:26:55+00:00\n", + " Frank H Ogawa Plaza\n", + " 7\n", + " 2018-02-24 18:47:31+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 1215\n", " <NA>\n", + " ...\n", " <NA>\n", - " None\n", 
- " None\n", + " 37.804562\n", + " -122.271738\n", + " 37.792714\n", + " -122.24878\n", + " 1969\n", + " Male\n", + " No\n", + " POINT (-122.27174 37.80456)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 16\n", - " 201712131325183120\n", - " 1376\n", - " 2017-12-13 13:25:18+00:00\n", - " Steuart St at Market St\n", - " 16\n", - " 2017-12-13 13:48:14+00:00\n", - " The Embarcadero at Sansome St\n", - " 6\n", - " 3120\n", + " 20171212152403227\n", + " 854\n", + " 2017-12-12 15:24:03+00:00\n", + " Frank H Ogawa Plaza\n", + " 7\n", + " 2017-12-12 15:38:17+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 227\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.79413\n", - " -122.39443\n", - " 37.80477\n", - " -122.403234\n", - " <NA>\n", - " <NA>\n", + " 37.804562\n", + " -122.271738\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", " <NA>\n", - " POINT (-122.39443 37.79413)\n", - " POINT (-122.40323 37.80477)\n", + " POINT (-122.27174 37.80456)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 17\n", - " 201708310827151646\n", - " 200\n", - " 2017-08-31 08:27:15+00:00\n", - " Powell St BART Station (Market St at 4th St)\n", - " 3\n", - " 2017-08-31 08:30:36+00:00\n", - " Montgomery St BART Station (Market St at 2nd St)\n", - " 21\n", - " 1646\n", + " 201803091621483450\n", + " 857\n", + " 2018-03-09 16:21:48+00:00\n", + " Frank H Ogawa Plaza\n", + " 7\n", + " 2018-03-09 16:36:06+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 3450\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.786375\n", - " -122.404904\n", - " 37.789625\n", - " -122.400811\n", - " 1988\n", + " 37.804562\n", + " -122.271738\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", " Male\n", - " <NA>\n", - " POINT (-122.4049 37.78638)\n", - " POINT (-122.40081 37.78963)\n", + " Yes\n", + " POINT (-122.27174 37.80456)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 18\n", - " 201801251754102907\n", - " 1490\n", - " 2018-01-25 17:54:10+00:00\n", - " Esprit Park\n", - " 
126\n", - " 2018-01-25 18:19:01+00:00\n", - " The Embarcadero at Vallejo St\n", - " 8\n", - " 2907\n", + " 201801021932232717\n", + " 914\n", + " 2018-01-02 19:32:23+00:00\n", + " Frank H Ogawa Plaza\n", + " 7\n", + " 2018-01-02 19:47:38+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 2717\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.761634\n", - " -122.390648\n", - " 37.799953\n", - " -122.398525\n", - " 1989\n", - " Female\n", - " No\n", - " POINT (-122.39065 37.76163)\n", - " POINT (-122.39852 37.79995)\n", + " 37.804562\n", + " -122.271738\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", + " Yes\n", + " POINT (-122.27174 37.80456)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 19\n", - " 201709230951302222\n", - " 319\n", - " 2017-09-23 09:51:30+00:00\n", - " 7th St at Brannan St\n", - " 79\n", - " 2017-09-23 09:56:49+00:00\n", - " San Francisco Caltrain (Townsend St at 4th St)\n", - " 30\n", - " 2222\n", + " 201803131437033724\n", + " 917\n", + " 2018-03-13 14:37:03+00:00\n", + " Grand Ave at Webster St\n", + " 181\n", + " 2018-03-13 14:52:20+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 3724\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.773492\n", - " -122.403672\n", - " 37.776598\n", - " -122.395282\n", - " 1975\n", + " 37.811377\n", + " -122.265192\n", + " 37.792714\n", + " -122.24878\n", + " 1989\n", " Male\n", - " <NA>\n", - " POINT (-122.40367 37.77349)\n", - " POINT (-122.39528 37.7766)\n", + " No\n", + " POINT (-122.26519 37.81138)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 20\n", - " 20180220172815415\n", - " 4009\n", - " 2018-02-20 17:28:15+00:00\n", - " Franklin St at 9th St\n", - " 162\n", - " 2018-02-20 18:35:05+00:00\n", - " Telegraph Ave at 27th St\n", - " 179\n", - " 415\n", + " 20170930184510496\n", + " 1367\n", + " 2017-09-30 18:45:10+00:00\n", + " Lake Merritt BART Station\n", + " 163\n", + " 2017-09-30 19:07:58+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 496\n", " <NA>\n", " 
...\n", " <NA>\n", - " 37.800516\n", - " -122.27208\n", - " 37.816073\n", - " -122.267886\n", - " 1973\n", - " Male\n", - " Yes\n", - " POINT (-122.27208 37.80052)\n", - " POINT (-122.26789 37.81607)\n", + " 37.79732\n", + " -122.26532\n", + " 37.792714\n", + " -122.24878\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " POINT (-122.26532 37.79732)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 21\n", - " 201710191714443003\n", - " 691\n", - " 2017-10-19 17:14:44+00:00\n", - " Harrison St at 20th St\n", - " 129\n", - " 2017-10-19 17:26:16+00:00\n", - " Valencia St at 22nd St\n", - " 133\n", - " 3003\n", + " 201712061755593426\n", + " 519\n", + " 2017-12-06 17:55:59+00:00\n", + " Lake Merritt BART Station\n", + " 163\n", + " 2017-12-06 18:04:39+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 3426\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.758862\n", - " -122.412544\n", - " 37.755213\n", - " -122.420975\n", - " 1958\n", + " 37.79732\n", + " -122.26532\n", + " 37.792714\n", + " -122.24878\n", + " 1986\n", " Male\n", " <NA>\n", - " POINT (-122.41254 37.75886)\n", - " POINT (-122.42098 37.75521)\n", + " POINT (-122.26532 37.79732)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 22\n", - " 595146\n", - " 453\n", - " 2015-01-07 18:34:00+00:00\n", - " Market at 10th\n", - " 67\n", - " 2015-01-07 18:42:00+00:00\n", - " Townsend at 7th\n", - " 65\n", - " 421\n", - " 95014\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 201711062204002182\n", + " 420\n", + " 2017-11-06 22:04:00+00:00\n", + " Lake Merritt BART Station\n", + " 163\n", + " 2017-11-06 22:11:00+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 2182\n", " <NA>\n", + " ...\n", " <NA>\n", + " 37.79732\n", + " -122.26532\n", + " 37.792714\n", + " -122.24878\n", + " 1992\n", + " Male\n", " <NA>\n", - " None\n", - " None\n", + " POINT (-122.26532 37.79732)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 23\n", - " 201708290913502454\n", - " 
788\n", - " 2017-08-29 09:13:50+00:00\n", - " San Francisco Caltrain (Townsend St at 4th St)\n", - " 30\n", - " 2017-08-29 09:26:58+00:00\n", - " The Embarcadero at Vallejo St\n", - " 8\n", - " 2454\n", + " 201709122036152238\n", + " 612\n", + " 2017-09-12 20:36:15+00:00\n", + " Lake Merritt BART Station\n", + " 163\n", + " 2017-09-12 20:46:27+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 2238\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.776598\n", - " -122.395282\n", - " 37.799953\n", - " -122.398525\n", - " 1979\n", + " 37.79732\n", + " -122.26532\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", " Male\n", " <NA>\n", - " POINT (-122.39528 37.7766)\n", - " POINT (-122.39852 37.79995)\n", + " POINT (-122.26532 37.79732)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 24\n", - " 201712271150433036\n", - " 150\n", - " 2017-12-27 11:50:43+00:00\n", - " Powell St BART Station (Market St at 4th St)\n", - " 3\n", - " 2017-12-27 11:53:14+00:00\n", - " 4th St at Harrison St\n", - " 47\n", - " 3036\n", + " 201712062310481332\n", + " 442\n", + " 2017-12-06 23:10:48+00:00\n", + " Lake Merritt BART Station\n", + " 163\n", + " 2017-12-06 23:18:11+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 1332\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.786375\n", - " -122.404904\n", - " 37.780955\n", - " -122.399749\n", - " 1989\n", + " 37.79732\n", + " -122.26532\n", + " 37.792714\n", + " -122.24878\n", + " 1981\n", " Male\n", " <NA>\n", - " POINT (-122.4049 37.78638)\n", - " POINT (-122.39975 37.78095)\n", + " POINT (-122.26532 37.79732)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", "\n", @@ -742,221 +746,194 @@ "[1947417 rows x 21 columns in total]" ], "text/plain": [ - " trip_id duration_sec start_date \\\n", - "0 1304531 597 2016-08-05 10:55:00+00:00 \n", - "1 184870 403 2014-02-14 14:50:00+00:00 \n", - "2 20170702115603836 16695 2017-07-02 11:56:03+00:00 \n", - "3 1066810 953 2016-01-21 08:24:00+00:00 \n", - "4 220481 679 2014-03-19 19:20:00+00:00 
\n", - "5 738474 358 2015-04-23 16:45:00+00:00 \n", - "6 229264 286 2014-03-27 17:56:00+00:00 \n", - "7 352010 3621 2014-07-06 13:55:00+00:00 \n", - "8 156255 416 2014-01-16 18:06:00+00:00 \n", - "9 1040197 1054 2015-12-15 18:05:00+00:00 \n", - "10 1152693 562 2016-04-07 08:18:00+00:00 \n", - "11 201804191735183401 887 2018-04-19 17:35:18+00:00 \n", - "12 209283 943 2014-03-11 09:01:00+00:00 \n", - "13 201708281404312530 389 2017-08-28 14:04:31+00:00 \n", - "14 20171124115158841 384 2017-11-24 11:51:58+00:00 \n", - "15 1321042 874 2016-08-18 08:14:00+00:00 \n", - "16 201712131325183120 1376 2017-12-13 13:25:18+00:00 \n", - "17 201708310827151646 200 2017-08-31 08:27:15+00:00 \n", - "18 201801251754102907 1490 2018-01-25 17:54:10+00:00 \n", - "19 201709230951302222 319 2017-09-23 09:51:30+00:00 \n", - "20 20180220172815415 4009 2018-02-20 17:28:15+00:00 \n", - "21 201710191714443003 691 2017-10-19 17:14:44+00:00 \n", - "22 595146 453 2015-01-07 18:34:00+00:00 \n", - "23 201708290913502454 788 2017-08-29 09:13:50+00:00 \n", - "24 201712271150433036 150 2017-12-27 11:50:43+00:00 \n", + " trip_id duration_sec start_date \\\n", + "201802092135083596 788 2018-02-09 21:35:08+00:00 \n", + " 20171217135737144 1072 2017-12-17 13:57:37+00:00 \n", + "201803261642393539 486 2018-03-26 16:42:39+00:00 \n", + "201802281657253632 560 2018-02-28 16:57:25+00:00 \n", + "201708152357422491 965 2017-08-15 23:57:42+00:00 \n", + "201801161800473291 489 2018-01-16 18:00:47+00:00 \n", + "201802201913231257 596 2018-02-20 19:13:23+00:00 \n", + "201708242325001279 1341 2017-08-24 23:25:00+00:00 \n", + " 20170913210653295 367 2017-09-13 21:06:53+00:00 \n", + "201708192053311490 743 2017-08-19 20:53:31+00:00 \n", + " 20170810204454839 1256 2017-08-10 20:44:54+00:00 \n", + "201711181823281960 353 2017-11-18 18:23:28+00:00 \n", + "201801111613101305 858 2018-01-11 16:13:10+00:00 \n", + "201712181738372587 807 2017-12-18 17:38:37+00:00 \n", + "201803161910283751 564 2018-03-16 19:10:28+00:00 \n", 
+ "201802241826551215 1235 2018-02-24 18:26:55+00:00 \n", + " 20171212152403227 854 2017-12-12 15:24:03+00:00 \n", + "201803091621483450 857 2018-03-09 16:21:48+00:00 \n", + "201801021932232717 914 2018-01-02 19:32:23+00:00 \n", + "201803131437033724 917 2018-03-13 14:37:03+00:00 \n", + " 20170930184510496 1367 2017-09-30 18:45:10+00:00 \n", + "201712061755593426 519 2017-12-06 17:55:59+00:00 \n", + "201711062204002182 420 2017-11-06 22:04:00+00:00 \n", + "201709122036152238 612 2017-09-12 20:36:15+00:00 \n", + "201712062310481332 442 2017-12-06 23:10:48+00:00 \n", "\n", - " start_station_name start_station_id \\\n", - "0 San Francisco Caltrain 2 (330 Townsend) 69 \n", - "1 Howard at 2nd 63 \n", - "2 Union Square (Powell St at Post St) 324 \n", - "3 Civic Center BART (7th at Market) 72 \n", - "4 San Francisco Caltrain 2 (330 Townsend) 69 \n", - "5 2nd at Folsom 62 \n", - "6 Embarcadero at Sansome 60 \n", - "7 Embarcadero at Sansome 60 \n", - "8 Embarcadero at Bryant 54 \n", - "9 Steuart at Market 74 \n", - "10 San Francisco Caltrain (Townsend at 4th) 70 \n", - "11 Montgomery St BART Station (Market St at 2nd St) 21 \n", - "12 South Van Ness at Market 66 \n", - "13 16th St at Prosper St 105 \n", - "14 2nd Ave at E 18th St 200 \n", - "15 San Francisco Caltrain (Townsend at 4th) 70 \n", - "16 Steuart St at Market St 16 \n", - "17 Powell St BART Station (Market St at 4th St) 3 \n", - "18 Esprit Park 126 \n", - "19 7th St at Brannan St 79 \n", - "20 Franklin St at 9th St 162 \n", - "21 Harrison St at 20th St 129 \n", - "22 Market at 10th 67 \n", - "23 San Francisco Caltrain (Townsend St at 4th St) 30 \n", - "24 Powell St BART Station (Market St at 4th St) 3 \n", + " start_station_name start_station_id end_date \\\n", + " 10th Ave at E 15th St 222 2018-02-09 21:48:17+00:00 \n", + " 10th Ave at E 15th St 222 2017-12-17 14:15:30+00:00 \n", + " 10th St at Fallon St 201 2018-03-26 16:50:46+00:00 \n", + " 10th St at Fallon St 201 2018-02-28 17:06:46+00:00 \n", + " 10th St at 
Fallon St 201 2017-08-16 00:13:48+00:00 \n", + " 10th St at Fallon St 201 2018-01-16 18:08:56+00:00 \n", + " 10th St at Fallon St 201 2018-02-20 19:23:19+00:00 \n", + " 10th St at Fallon St 201 2017-08-24 23:47:22+00:00 \n", + " 10th St at Fallon St 201 2017-09-13 21:13:00+00:00 \n", + " 2nd Ave at E 18th St 200 2017-08-19 21:05:54+00:00 \n", + " 2nd Ave at E 18th St 200 2017-08-10 21:05:50+00:00 \n", + " 2nd Ave at E 18th St 200 2017-11-18 18:29:22+00:00 \n", + " Frank H Ogawa Plaza 7 2018-01-11 16:27:28+00:00 \n", + " Frank H Ogawa Plaza 7 2017-12-18 17:52:04+00:00 \n", + " Frank H Ogawa Plaza 7 2018-03-16 19:19:52+00:00 \n", + " Frank H Ogawa Plaza 7 2018-02-24 18:47:31+00:00 \n", + " Frank H Ogawa Plaza 7 2017-12-12 15:38:17+00:00 \n", + " Frank H Ogawa Plaza 7 2018-03-09 16:36:06+00:00 \n", + " Frank H Ogawa Plaza 7 2018-01-02 19:47:38+00:00 \n", + " Grand Ave at Webster St 181 2018-03-13 14:52:20+00:00 \n", + "Lake Merritt BART Station 163 2017-09-30 19:07:58+00:00 \n", + "Lake Merritt BART Station 163 2017-12-06 18:04:39+00:00 \n", + "Lake Merritt BART Station 163 2017-11-06 22:11:00+00:00 \n", + "Lake Merritt BART Station 163 2017-09-12 20:46:27+00:00 \n", + "Lake Merritt BART Station 163 2017-12-06 23:18:11+00:00 \n", "\n", - " end_date \\\n", - "0 2016-08-05 11:05:00+00:00 \n", - "1 2014-02-14 14:56:00+00:00 \n", - "2 2017-07-02 16:34:19+00:00 \n", - "3 2016-01-21 08:40:00+00:00 \n", - "4 2014-03-19 19:31:00+00:00 \n", - "5 2015-04-23 16:51:00+00:00 \n", - "6 2014-03-27 18:01:00+00:00 \n", - "7 2014-07-06 14:55:00+00:00 \n", - "8 2014-01-16 18:13:00+00:00 \n", - "9 2015-12-15 18:22:00+00:00 \n", - "10 2016-04-07 08:27:00+00:00 \n", - "11 2018-04-19 17:50:06+00:00 \n", - "12 2014-03-11 09:16:00+00:00 \n", - "13 2017-08-28 14:11:00+00:00 \n", - "14 2017-11-24 11:58:23+00:00 \n", - "15 2016-08-18 08:29:00+00:00 \n", - "16 2017-12-13 13:48:14+00:00 \n", - "17 2017-08-31 08:30:36+00:00 \n", - "18 2018-01-25 18:19:01+00:00 \n", - "19 2017-09-23 09:56:49+00:00 
\n", - "20 2018-02-20 18:35:05+00:00 \n", - "21 2017-10-19 17:26:16+00:00 \n", - "22 2015-01-07 18:42:00+00:00 \n", - "23 2017-08-29 09:26:58+00:00 \n", - "24 2017-12-27 11:53:14+00:00 \n", + " end_station_name end_station_id bike_number zip_code ... \\\n", + "10th Ave at E 15th St 222 3596 ... \n", + "10th Ave at E 15th St 222 144 ... \n", + "10th Ave at E 15th St 222 3539 ... \n", + "10th Ave at E 15th St 222 3632 ... \n", + "10th Ave at E 15th St 222 2491 ... \n", + "10th Ave at E 15th St 222 3291 ... \n", + "10th Ave at E 15th St 222 1257 ... \n", + "10th Ave at E 15th St 222 1279 ... \n", + "10th Ave at E 15th St 222 295 ... \n", + "10th Ave at E 15th St 222 1490 ... \n", + "10th Ave at E 15th St 222 839 ... \n", + "10th Ave at E 15th St 222 1960 ... \n", + "10th Ave at E 15th St 222 1305 ... \n", + "10th Ave at E 15th St 222 2587 ... \n", + "10th Ave at E 15th St 222 3751 ... \n", + "10th Ave at E 15th St 222 1215 ... \n", + "10th Ave at E 15th St 222 227 ... \n", + "10th Ave at E 15th St 222 3450 ... \n", + "10th Ave at E 15th St 222 2717 ... \n", + "10th Ave at E 15th St 222 3724 ... \n", + "10th Ave at E 15th St 222 496 ... \n", + "10th Ave at E 15th St 222 3426 ... \n", + "10th Ave at E 15th St 222 2182 ... \n", + "10th Ave at E 15th St 222 2238 ... \n", + "10th Ave at E 15th St 222 1332 ... \n", "\n", - " end_station_name end_station_id \\\n", - "0 Powell Street BART 39 \n", - "1 Commercial at Montgomery 45 \n", - "2 Union Square (Powell St at Post St) 324 \n", - "3 Embarcadero at Sansome 60 \n", - "4 Civic Center BART (7th at Market) 72 \n", - "5 Steuart at Market 74 \n", - "6 Davis at Jackson 42 \n", - "7 Embarcadero at Sansome 60 \n", - "8 San Francisco Caltrain (Townsend at 4th) 70 \n", - "9 San Francisco Caltrain (Townsend at 4th) 70 \n", - "10 Steuart at Market 74 \n", - "11 Civic Center/UN Plaza BART Station (Market St ... 
44 \n", - "12 Temporary Transbay Terminal (Howard at Beale) 55 \n", - "13 Mission Playground 121 \n", - "14 El Embarcadero at Grand Ave 197 \n", - "15 Beale at Market 56 \n", - "16 The Embarcadero at Sansome St 6 \n", - "17 Montgomery St BART Station (Market St at 2nd St) 21 \n", - "18 The Embarcadero at Vallejo St 8 \n", - "19 San Francisco Caltrain (Townsend St at 4th St) 30 \n", - "20 Telegraph Ave at 27th St 179 \n", - "21 Valencia St at 22nd St 133 \n", - "22 Townsend at 7th 65 \n", - "23 The Embarcadero at Vallejo St 8 \n", - "24 4th St at Harrison St 47 \n", + "c_subscription_type start_station_latitude start_station_longitude \\\n", + " 37.792714 -122.24878 \n", + " 37.792714 -122.24878 \n", + " 37.797673 -122.262997 \n", + " 37.797673 -122.262997 \n", + " 37.797673 -122.262997 \n", + " 37.797673 -122.262997 \n", + " 37.797673 -122.262997 \n", + " 37.797673 -122.262997 \n", + " 37.797673 -122.262997 \n", + " 37.800214 -122.25381 \n", + " 37.800214 -122.25381 \n", + " 37.800214 -122.25381 \n", + " 37.804562 -122.271738 \n", + " 37.804562 -122.271738 \n", + " 37.804562 -122.271738 \n", + " 37.804562 -122.271738 \n", + " 37.804562 -122.271738 \n", + " 37.804562 -122.271738 \n", + " 37.804562 -122.271738 \n", + " 37.811377 -122.265192 \n", + " 37.79732 -122.26532 \n", + " 37.79732 -122.26532 \n", + " 37.79732 -122.26532 \n", + " 37.79732 -122.26532 \n", + " 37.79732 -122.26532 \n", "\n", - " bike_number zip_code ... c_subscription_type start_station_latitude \\\n", - "0 214 95121 ... Subscriber \n", - "1 342 94122 ... Subscriber \n", - "2 836 ... 37.7883 \n", - "3 212 94103 ... Subscriber \n", - "4 478 94107 ... Subscriber \n", - "5 443 94105 ... Subscriber \n", - "6 342 94133 ... Subscriber \n", - "7 390 4038 ... Customer \n", - "8 510 94107 ... Subscriber \n", - "9 700 94111 ... Subscriber \n", - "10 419 94158 ... Subscriber \n", - "11 3401 ... 37.789625 \n", - "12 532 94105 ... Subscriber \n", - "13 2530 ... 37.764285 \n", - "14 841 ... 
37.800214 \n", - "15 390 95050 ... Subscriber \n", - "16 3120 ... 37.79413 \n", - "17 1646 ... 37.786375 \n", - "18 2907 ... 37.761634 \n", - "19 2222 ... 37.773492 \n", - "20 415 ... 37.800516 \n", - "21 3003 ... 37.758862 \n", - "22 421 95014 ... Subscriber \n", - "23 2454 ... 37.776598 \n", - "24 3036 ... 37.786375 \n", + " end_station_latitude end_station_longitude member_birth_year \\\n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1969 \n", + " 37.792714 -122.24878 1987 \n", + " 37.792714 -122.24878 \n", + " 37.792714 -122.24878 \n", + " 37.792714 -122.24878 1988 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1987 \n", + " 37.792714 -122.24878 1969 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1989 \n", + " 37.792714 -122.24878 \n", + " 37.792714 -122.24878 1986 \n", + " 37.792714 -122.24878 1992 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1981 \n", "\n", - " start_station_longitude end_station_latitude end_station_longitude \\\n", - "0 \n", - "1 \n", - "2 -122.408531 37.7883 -122.408531 \n", - "3 \n", - "4 \n", - "5 \n", - "6 \n", - "7 \n", - "8 \n", - "9 \n", - "10 \n", - "11 -122.400811 37.781074 -122.411738 \n", - "12 \n", - "13 -122.431804 37.75921 -122.421339 \n", - "14 -122.25381 37.808848 -122.24968 \n", - "15 \n", - "16 -122.39443 37.80477 -122.403234 \n", - "17 -122.404904 37.789625 -122.400811 \n", - "18 -122.390648 37.799953 -122.398525 \n", - "19 -122.403672 37.776598 -122.395282 \n", - "20 -122.27208 37.816073 -122.267886 \n", - "21 -122.412544 37.755213 -122.420975 \n", - "22 \n", - "23 -122.395282 37.799953 -122.398525 \n", - "24 -122.404904 37.780955 -122.399749 
\n", + " member_gender bike_share_for_all_trip start_station_geom \\\n", + " Male Yes POINT (-122.24878 37.79271) \n", + " Male POINT (-122.24878 37.79271) \n", + " Male Yes POINT (-122.263 37.79767) \n", + " Male Yes POINT (-122.263 37.79767) \n", + " POINT (-122.263 37.79767) \n", + " Male Yes POINT (-122.263 37.79767) \n", + " Male Yes POINT (-122.263 37.79767) \n", + " Male POINT (-122.263 37.79767) \n", + " Male POINT (-122.263 37.79767) \n", + " POINT (-122.25381 37.80021) \n", + " POINT (-122.25381 37.80021) \n", + " Male POINT (-122.25381 37.80021) \n", + " Male Yes POINT (-122.27174 37.80456) \n", + " Male POINT (-122.27174 37.80456) \n", + " Male No POINT (-122.27174 37.80456) \n", + " Male No POINT (-122.27174 37.80456) \n", + " Male POINT (-122.27174 37.80456) \n", + " Male Yes POINT (-122.27174 37.80456) \n", + " Male Yes POINT (-122.27174 37.80456) \n", + " Male No POINT (-122.26519 37.81138) \n", + " POINT (-122.26532 37.79732) \n", + " Male POINT (-122.26532 37.79732) \n", + " Male POINT (-122.26532 37.79732) \n", + " Male POINT (-122.26532 37.79732) \n", + " Male POINT (-122.26532 37.79732) \n", "\n", - " member_birth_year member_gender bike_share_for_all_trip \\\n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 \n", - "5 \n", - "6 \n", - "7 \n", - "8 \n", - "9 \n", - "10 \n", - "11 1979 Male No \n", - "12 \n", - "13 1981 Male \n", - "14 1977 Female \n", - "15 \n", - "16 \n", - "17 1988 Male \n", - "18 1989 Female No \n", - "19 1975 Male \n", - "20 1973 Male Yes \n", - "21 1958 Male \n", - "22 \n", - "23 1979 Male \n", - "24 1989 Male \n", - "\n", - " start_station_geom end_station_geom \n", - "0 None None \n", - "1 None None \n", - "2 POINT (-122.40853 37.7883) POINT (-122.40853 37.7883) \n", - "3 None None \n", - "4 None None \n", - "5 None None \n", - "6 None None \n", - "7 None None \n", - "8 None None \n", - "9 None None \n", - "10 None None \n", - "11 POINT (-122.40081 37.78963) POINT (-122.41174 37.78107) \n", - "12 None None \n", - "13 POINT 
(-122.4318 37.76428) POINT (-122.42134 37.75921) \n", - "14 POINT (-122.25381 37.80021) POINT (-122.24968 37.80885) \n", - "15 None None \n", - "16 POINT (-122.39443 37.79413) POINT (-122.40323 37.80477) \n", - "17 POINT (-122.4049 37.78638) POINT (-122.40081 37.78963) \n", - "18 POINT (-122.39065 37.76163) POINT (-122.39852 37.79995) \n", - "19 POINT (-122.40367 37.77349) POINT (-122.39528 37.7766) \n", - "20 POINT (-122.27208 37.80052) POINT (-122.26789 37.81607) \n", - "21 POINT (-122.41254 37.75886) POINT (-122.42098 37.75521) \n", - "22 None None \n", - "23 POINT (-122.39528 37.7766) POINT (-122.39852 37.79995) \n", - "24 POINT (-122.4049 37.78638) POINT (-122.39975 37.78095) \n", + " end_station_geom \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", "...\n", "\n", "[1947417 rows x 21 columns]" @@ -1167,32 +1144,32 @@ "[2842 rows x 2 columns in total]" ], "text/plain": [ - " trip_hour num_trips\n", - "0 2018-01-01 00:00:00+00:00 20\n", - "1 2018-01-01 01:00:00+00:00 25\n", - "2 2018-01-01 02:00:00+00:00 13\n", - "3 2018-01-01 03:00:00+00:00 11\n", - "4 2018-01-01 05:00:00+00:00 4\n", - "5 2018-01-01 06:00:00+00:00 8\n", - 
"6 2018-01-01 07:00:00+00:00 8\n", - "7 2018-01-01 08:00:00+00:00 20\n", - "8 2018-01-01 09:00:00+00:00 30\n", - "9 2018-01-01 10:00:00+00:00 41\n", - "10 2018-01-01 11:00:00+00:00 45\n", - "11 2018-01-01 12:00:00+00:00 54\n", - "12 2018-01-01 13:00:00+00:00 57\n", - "13 2018-01-01 14:00:00+00:00 68\n", - "14 2018-01-01 15:00:00+00:00 86\n", - "15 2018-01-01 16:00:00+00:00 72\n", - "16 2018-01-01 17:00:00+00:00 72\n", - "17 2018-01-01 18:00:00+00:00 47\n", - "18 2018-01-01 19:00:00+00:00 32\n", - "19 2018-01-01 20:00:00+00:00 34\n", - "20 2018-01-01 21:00:00+00:00 27\n", - "21 2018-01-01 22:00:00+00:00 15\n", - "22 2018-01-01 23:00:00+00:00 6\n", - "23 2018-01-02 00:00:00+00:00 2\n", - "24 2018-01-02 01:00:00+00:00 1\n", + " trip_hour num_trips\n", + "2018-01-01 00:00:00+00:00 20\n", + "2018-01-01 01:00:00+00:00 25\n", + "2018-01-01 02:00:00+00:00 13\n", + "2018-01-01 03:00:00+00:00 11\n", + "2018-01-01 05:00:00+00:00 4\n", + "2018-01-01 06:00:00+00:00 8\n", + "2018-01-01 07:00:00+00:00 8\n", + "2018-01-01 08:00:00+00:00 20\n", + "2018-01-01 09:00:00+00:00 30\n", + "2018-01-01 10:00:00+00:00 41\n", + "2018-01-01 11:00:00+00:00 45\n", + "2018-01-01 12:00:00+00:00 54\n", + "2018-01-01 13:00:00+00:00 57\n", + "2018-01-01 14:00:00+00:00 68\n", + "2018-01-01 15:00:00+00:00 86\n", + "2018-01-01 16:00:00+00:00 72\n", + "2018-01-01 17:00:00+00:00 72\n", + "2018-01-01 18:00:00+00:00 47\n", + "2018-01-01 19:00:00+00:00 32\n", + "2018-01-01 20:00:00+00:00 34\n", + "2018-01-01 21:00:00+00:00 27\n", + "2018-01-01 22:00:00+00:00 15\n", + "2018-01-01 23:00:00+00:00 6\n", + "2018-01-02 00:00:00+00:00 2\n", + "2018-01-02 01:00:00+00:00 1\n", "...\n", "\n", "[2842 rows x 2 columns]" @@ -1253,227 +1230,227 @@ " \n", " \n", " 0\n", - " 2018-05-05 01:00:00+00:00\n", - " 50.123672\n", + " 2018-04-26 11:00:00+00:00\n", + " 204.291275\n", " 0.95\n", - " -13.062586\n", - " 113.309931\n", + " 149.151441\n", + " 259.431109\n", " \n", " \n", " \n", " 1\n", - " 2018-05-05 07:00:00+00:00\n", - 
" 103.112846\n", + " 2018-04-27 13:00:00+00:00\n", + " 196.034332\n", " 0.95\n", - " 33.725954\n", - " 172.499739\n", + " 203.125978\n", + " 188.942686\n", " \n", " \n", " \n", " 2\n", - " 2018-05-03 15:00:00+00:00\n", - " 230.49147\n", + " 2018-04-27 20:00:00+00:00\n", + " 133.339386\n", " 0.95\n", - " 152.635986\n", - " 308.346954\n", + " 132.658946\n", + " 134.019826\n", " \n", " \n", " \n", " 3\n", - " 2018-05-02 08:00:00+00:00\n", - " 737.477356\n", + " 2018-04-28 05:00:00+00:00\n", + " -27.321686\n", " 0.95\n", - " 562.979208\n", - " 911.975504\n", + " -13.918083\n", + " -40.725288\n", " \n", " \n", " \n", " 4\n", - " 2018-05-01 08:00:00+00:00\n", - " 679.980469\n", + " 2018-04-29 12:00:00+00:00\n", + " 117.657822\n", " 0.95\n", - " 479.980134\n", - " 879.980803\n", + " 58.020439\n", + " 177.295205\n", " \n", " \n", " \n", " 5\n", - " 2018-05-06 18:00:00+00:00\n", - " 136.80835\n", + " 2018-04-24 10:00:00+00:00\n", + " 221.464111\n", " 0.95\n", - " -13.813863\n", - " 287.430562\n", + " 154.598621\n", + " 288.329602\n", " \n", " \n", " \n", " 6\n", - " 2018-05-01 11:00:00+00:00\n", - " 120.364288\n", + " 2018-04-24 23:00:00+00:00\n", + " 56.203827\n", " 0.95\n", - " 52.778249\n", - " 187.950328\n", + " 42.096868\n", + " 70.310786\n", " \n", " \n", " \n", " 7\n", - " 2018-05-06 22:00:00+00:00\n", - " 64.722443\n", + " 2018-04-29 07:00:00+00:00\n", + " -14.801514\n", " 0.95\n", - " -55.555842\n", - " 185.000727\n", + " -48.905982\n", + " 19.302954\n", " \n", " \n", " \n", " 8\n", - " 2018-05-03 02:00:00+00:00\n", - " 42.689804\n", + " 2018-04-24 22:00:00+00:00\n", + " 58.174316\n", " 0.95\n", - " 33.258414\n", - " 52.121194\n", + " 85.290985\n", + " 31.057648\n", " \n", " \n", " \n", " 9\n", - " 2018-05-07 17:00:00+00:00\n", - " 594.999084\n", + " 2018-04-25 08:00:00+00:00\n", + " 666.577393\n", " 0.95\n", - " 346.917217\n", - " 843.080952\n", + " 518.655663\n", + " 814.499122\n", " \n", " \n", " \n", " 10\n", - " 2018-05-03 20:00:00+00:00\n", - " 161.822281\n", 
+ " 2018-04-29 01:00:00+00:00\n", + " 40.19632\n", " 0.95\n", - " 100.005942\n", - " 223.63862\n", + " 48.957491\n", + " 31.435148\n", " \n", " \n", " \n", " 11\n", - " 2018-05-01 20:00:00+00:00\n", - " 173.801025\n", + " 2018-04-29 02:00:00+00:00\n", + " 29.00975\n", " 0.95\n", - " 56.460376\n", - " 291.141675\n", + " -8.137303\n", + " 66.156804\n", " \n", " \n", " \n", " 12\n", - " 2018-05-04 17:00:00+00:00\n", - " 485.449829\n", + " 2018-04-30 18:00:00+00:00\n", + " 488.885284\n", " 0.95\n", - " 356.038539\n", - " 614.86112\n", + " 315.531321\n", + " 662.239248\n", " \n", " \n", " \n", " 13\n", - " 2018-05-04 09:00:00+00:00\n", - " 418.055878\n", + " 2018-04-27 10:00:00+00:00\n", + " 188.79628\n", " 0.95\n", - " 281.134736\n", - " 554.977019\n", + " 157.126395\n", + " 220.466165\n", " \n", " \n", " \n", " 14\n", - " 2018-05-07 03:00:00+00:00\n", - " 24.735134\n", + " 2018-04-24 21:00:00+00:00\n", + " 107.512665\n", " 0.95\n", - " -100.607727\n", - " 150.077995\n", + " 108.890078\n", + " 106.135251\n", " \n", " \n", " \n", " 15\n", - " 2018-05-05 11:00:00+00:00\n", - " 186.08136\n", + " 2018-04-28 14:00:00+00:00\n", + " 149.738419\n", " 0.95\n", - " 140.706789\n", - " 231.455931\n", + " 161.696173\n", + " 137.780664\n", " \n", " \n", " \n", " 16\n", - " 2018-05-03 08:00:00+00:00\n", - " 675.380249\n", + " 2018-04-28 20:00:00+00:00\n", + " 71.378677\n", " 0.95\n", - " 532.913707\n", - " 817.846791\n", + " 98.940288\n", + " 43.817067\n", " \n", " \n", " \n", " 17\n", - " 2018-05-02 09:00:00+00:00\n", - " 537.494812\n", + " 2018-04-30 13:00:00+00:00\n", + " 139.673706\n", " 0.95\n", - " 376.406922\n", - " 698.582702\n", + " 66.493742\n", + " 212.85367\n", " \n", " \n", " \n", " 18\n", - " 2018-05-01 12:00:00+00:00\n", - " 101.637169\n", + " 2018-04-24 12:00:00+00:00\n", + " 144.577728\n", " 0.95\n", - " 55.141509\n", - " 148.132829\n", + " 120.01921\n", + " 169.136247\n", " \n", " \n", " \n", " 19\n", - " 2018-05-05 00:00:00+00:00\n", - " 7.469772\n", + " 2018-04-25 
00:00:00+00:00\n", + " 54.215515\n", " 0.95\n", - " -23.930392\n", - " 38.869936\n", + " 46.8394\n", + " 61.591631\n", " \n", " \n", " \n", " 20\n", - " 2018-05-02 14:00:00+00:00\n", - " 153.851379\n", + " 2018-04-26 05:00:00+00:00\n", + " 8.140533\n", " 0.95\n", - " 104.224826\n", - " 203.477932\n", + " -14.613272\n", + " 30.894339\n", " \n", " \n", " \n", " 21\n", - " 2018-05-04 13:00:00+00:00\n", - " 162.676117\n", + " 2018-04-26 14:00:00+00:00\n", + " 198.744949\n", " 0.95\n", - " 113.098327\n", - " 212.253907\n", + " 174.982268\n", + " 222.50763\n", " \n", " \n", " \n", " 22\n", - " 2018-05-04 16:00:00+00:00\n", - " 330.643402\n", + " 2018-04-27 02:00:00+00:00\n", + " 9.91806\n", " 0.95\n", - " 205.125168\n", - " 456.161636\n", + " -26.749948\n", + " 46.586069\n", " \n", " \n", " \n", " 23\n", - " 2018-05-04 21:00:00+00:00\n", - " 136.264679\n", + " 2018-04-29 03:00:00+00:00\n", + " 32.063339\n", " 0.95\n", - " 41.947438\n", - " 230.58192\n", + " -35.730978\n", + " 99.857656\n", " \n", " \n", " \n", " 24\n", - " 2018-05-02 17:00:00+00:00\n", - " 675.527222\n", + " 2018-04-27 04:00:00+00:00\n", + " 25.757111\n", " 0.95\n", - " 516.358698\n", - " 834.695746\n", + " 8.178037\n", + " 43.336184\n", " \n", " \n", " \n", @@ -1482,86 +1459,86 @@ "[168 rows x 6 columns in total]" ], "text/plain": [ - " forecast_timestamp forecast_value confidence_level \\\n", - "0 2018-05-05 01:00:00+00:00 50.123672 0.95 \n", - "1 2018-05-05 07:00:00+00:00 103.112846 0.95 \n", - "2 2018-05-03 15:00:00+00:00 230.49147 0.95 \n", - "3 2018-05-02 08:00:00+00:00 737.477356 0.95 \n", - "4 2018-05-01 08:00:00+00:00 679.980469 0.95 \n", - "5 2018-05-06 18:00:00+00:00 136.80835 0.95 \n", - "6 2018-05-01 11:00:00+00:00 120.364288 0.95 \n", - "7 2018-05-06 22:00:00+00:00 64.722443 0.95 \n", - "8 2018-05-03 02:00:00+00:00 42.689804 0.95 \n", - "9 2018-05-07 17:00:00+00:00 594.999084 0.95 \n", - "10 2018-05-03 20:00:00+00:00 161.822281 0.95 \n", - "11 2018-05-01 20:00:00+00:00 173.801025 0.95 \n", 
- "12 2018-05-04 17:00:00+00:00 485.449829 0.95 \n", - "13 2018-05-04 09:00:00+00:00 418.055878 0.95 \n", - "14 2018-05-07 03:00:00+00:00 24.735134 0.95 \n", - "15 2018-05-05 11:00:00+00:00 186.08136 0.95 \n", - "16 2018-05-03 08:00:00+00:00 675.380249 0.95 \n", - "17 2018-05-02 09:00:00+00:00 537.494812 0.95 \n", - "18 2018-05-01 12:00:00+00:00 101.637169 0.95 \n", - "19 2018-05-05 00:00:00+00:00 7.469772 0.95 \n", - "20 2018-05-02 14:00:00+00:00 153.851379 0.95 \n", - "21 2018-05-04 13:00:00+00:00 162.676117 0.95 \n", - "22 2018-05-04 16:00:00+00:00 330.643402 0.95 \n", - "23 2018-05-04 21:00:00+00:00 136.264679 0.95 \n", - "24 2018-05-02 17:00:00+00:00 675.527222 0.95 \n", + " forecast_timestamp forecast_value confidence_level \\\n", + "2018-04-26 11:00:00+00:00 204.291275 0.95 \n", + "2018-04-27 13:00:00+00:00 196.034332 0.95 \n", + "2018-04-27 20:00:00+00:00 133.339386 0.95 \n", + "2018-04-28 05:00:00+00:00 -27.321686 0.95 \n", + "2018-04-29 12:00:00+00:00 117.657822 0.95 \n", + "2018-04-24 10:00:00+00:00 221.464111 0.95 \n", + "2018-04-24 23:00:00+00:00 56.203827 0.95 \n", + "2018-04-29 07:00:00+00:00 -14.801514 0.95 \n", + "2018-04-24 22:00:00+00:00 58.174316 0.95 \n", + "2018-04-25 08:00:00+00:00 666.577393 0.95 \n", + "2018-04-29 01:00:00+00:00 40.19632 0.95 \n", + "2018-04-29 02:00:00+00:00 29.00975 0.95 \n", + "2018-04-30 18:00:00+00:00 488.885284 0.95 \n", + "2018-04-27 10:00:00+00:00 188.79628 0.95 \n", + "2018-04-24 21:00:00+00:00 107.512665 0.95 \n", + "2018-04-28 14:00:00+00:00 149.738419 0.95 \n", + "2018-04-28 20:00:00+00:00 71.378677 0.95 \n", + "2018-04-30 13:00:00+00:00 139.673706 0.95 \n", + "2018-04-24 12:00:00+00:00 144.577728 0.95 \n", + "2018-04-25 00:00:00+00:00 54.215515 0.95 \n", + "2018-04-26 05:00:00+00:00 8.140533 0.95 \n", + "2018-04-26 14:00:00+00:00 198.744949 0.95 \n", + "2018-04-27 02:00:00+00:00 9.91806 0.95 \n", + "2018-04-29 03:00:00+00:00 32.063339 0.95 \n", + "2018-04-27 04:00:00+00:00 25.757111 0.95 \n", "\n", - " 
prediction_interval_lower_bound prediction_interval_upper_bound \\\n", - "0 -13.062586 113.309931 \n", - "1 33.725954 172.499739 \n", - "2 152.635986 308.346954 \n", - "3 562.979208 911.975504 \n", - "4 479.980134 879.980803 \n", - "5 -13.813863 287.430562 \n", - "6 52.778249 187.950328 \n", - "7 -55.555842 185.000727 \n", - "8 33.258414 52.121194 \n", - "9 346.917217 843.080952 \n", - "10 100.005942 223.63862 \n", - "11 56.460376 291.141675 \n", - "12 356.038539 614.86112 \n", - "13 281.134736 554.977019 \n", - "14 -100.607727 150.077995 \n", - "15 140.706789 231.455931 \n", - "16 532.913707 817.846791 \n", - "17 376.406922 698.582702 \n", - "18 55.141509 148.132829 \n", - "19 -23.930392 38.869936 \n", - "20 104.224826 203.477932 \n", - "21 113.098327 212.253907 \n", - "22 205.125168 456.161636 \n", - "23 41.947438 230.58192 \n", - "24 516.358698 834.695746 \n", + " prediction_interval_lower_bound prediction_interval_upper_bound \\\n", + " 149.151441 259.431109 \n", + " 203.125978 188.942686 \n", + " 132.658946 134.019826 \n", + " -13.918083 -40.725288 \n", + " 58.020439 177.295205 \n", + " 154.598621 288.329602 \n", + " 42.096868 70.310786 \n", + " -48.905982 19.302954 \n", + " 85.290985 31.057648 \n", + " 518.655663 814.499122 \n", + " 48.957491 31.435148 \n", + " -8.137303 66.156804 \n", + " 315.531321 662.239248 \n", + " 157.126395 220.466165 \n", + " 108.890078 106.135251 \n", + " 161.696173 137.780664 \n", + " 98.940288 43.817067 \n", + " 66.493742 212.85367 \n", + " 120.01921 169.136247 \n", + " 46.8394 61.591631 \n", + " -14.613272 30.894339 \n", + " 174.982268 222.50763 \n", + " -26.749948 46.586069 \n", + " -35.730978 99.857656 \n", + " 8.178037 43.336184 \n", "\n", - " ai_forecast_status \n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 \n", - "5 \n", - "6 \n", - "7 \n", - "8 \n", - "9 \n", - "10 \n", - "11 \n", - "12 \n", - "13 \n", - "14 \n", - "15 \n", - "16 \n", - "17 \n", - "18 \n", - "19 \n", - "20 \n", - "21 \n", - "22 \n", - "23 \n", - "24 \n", 
+ "ai_forecast_status \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", "...\n", "\n", "[168 rows x 6 columns]" @@ -1573,7 +1550,8 @@ } ], "source": [ - "result = df_grouped.ai.forecast(timestamp_column=\"trip_hour\", data_column=\"num_trips\", horizon=168) # 1 week\n", + "# Using all the data except the last week (2842-168) for training. And predict the last week (168).\n", + "result = df_grouped.head(2842-168).ai.forecast(timestamp_column=\"trip_hour\", data_column=\"num_trips\", horizon=168) \n", "result" ] }, @@ -1597,6 +1575,13 @@ "df_all = df_all.tail(672) # 4 weeks" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot a line chart and compare with the actual result." + ] + }, { "cell_type": "code", "execution_count": 8, @@ -1614,7 +1599,7 @@ }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] diff --git a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb index edb864613c..68e10cb5ed 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb @@ -35,12 +35,12 @@ "\n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb index 9b05e1ab02..bc55096942 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -32,12 +32,12 @@ "\n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", diff --git a/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb b/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb index 04ea0571df..70714c823c 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb @@ -31,12 +31,12 @@ "\n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", diff --git a/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb b/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb index 15929fd666..b964117b67 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb @@ -35,12 +35,12 @@ "\n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " 
\n", " \n", diff --git a/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb b/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb index 413e473c2f..3220bbf6cd 100644 --- a/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb @@ -34,12 +34,12 @@ "\n", " \n", " \n", diff --git a/notebooks/getting_started/bq_dataframes_template.ipynb b/notebooks/getting_started/bq_dataframes_template.ipynb index ae772d035e..e8002fd611 100644 --- a/notebooks/getting_started/bq_dataframes_template.ipynb +++ b/notebooks/getting_started/bq_dataframes_template.ipynb @@ -35,12 +35,12 @@ "\n", " \n", " \n", diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb index ccecd09cb9..384f3b9c10 100644 --- a/notebooks/getting_started/getting_started_bq_dataframes.ipynb +++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb @@ -35,12 +35,12 @@ "\n", " \n", " \n", @@ -1658,7 +1658,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.10" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb b/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb index d95447f7e5..3370e94713 100644 --- a/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb +++ b/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb @@ -35,12 +35,12 @@ "\n", " \n", " \n", diff --git a/notebooks/ml/bq_dataframes_ml_linear_regression.ipynb b/notebooks/ml/bq_dataframes_ml_linear_regression.ipynb index 4123dd0e1c..00aa7a347c 100644 --- a/notebooks/ml/bq_dataframes_ml_linear_regression.ipynb +++ b/notebooks/ml/bq_dataframes_ml_linear_regression.ipynb @@ -36,12 +36,12 @@ "\n", " \n", " \n", diff --git a/notebooks/ml/bq_dataframes_ml_linear_regression_big.ipynb 
b/notebooks/ml/bq_dataframes_ml_linear_regression_big.ipynb index 0c5106f8f4..5c016f9157 100644 --- a/notebooks/ml/bq_dataframes_ml_linear_regression_big.ipynb +++ b/notebooks/ml/bq_dataframes_ml_linear_regression_big.ipynb @@ -36,12 +36,12 @@ "\n", " \n", " \n", diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index fbe074b0d0..f6f80b0009 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -33,12 +33,12 @@ "\n", " \n", " \n", diff --git a/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb b/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb index 605f879bc7..9792c90205 100644 --- a/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb +++ b/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb @@ -10,12 +10,12 @@ "\n", " \n", " \n", diff --git a/notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb b/notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb index b98589c2ae..f0dd5eb678 100644 --- a/notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb +++ b/notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb @@ -35,12 +35,12 @@ "\n", " \n", " \n", diff --git a/notebooks/visualization/tutorial.ipynb b/notebooks/visualization/tutorial.ipynb index 96aff12452..0923e03bc7 100644 --- a/notebooks/visualization/tutorial.ipynb +++ b/notebooks/visualization/tutorial.ipynb @@ -33,12 +33,12 @@ "\n", " \n", " \n", diff --git a/noxfile.py b/noxfile.py index a1e8e5b99b..96b59d6776 100644 --- a/noxfile.py +++ b/noxfile.py @@ -53,6 +53,7 @@ LINT_PATHS = [ "docs", "bigframes", + "scripts", "tests", "third_party", "noxfile.py", @@ -107,7 +108,7 @@ SYSTEM_TEST_EXTRAS: List[str] = [] SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { "3.9": ["tests", "anywidget"], - "3.10": ["tests"], + "3.10": ["tests", "polars"], "3.12": ["tests", "scikit-learn", "polars", 
"anywidget"], "3.13": ["tests", "polars"], } @@ -275,6 +276,7 @@ def mypy(session): "types-requests", "types-setuptools", "types-tabulate", + "types-PyYAML", "polars", "anywidget", ] diff --git a/samples/snippets/data_visualization_test.py b/samples/snippets/data_visualization_test.py new file mode 100644 index 0000000000..64cbbe0511 --- /dev/null +++ b/samples/snippets/data_visualization_test.py @@ -0,0 +1,149 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (t +# you may not use this file except in compliance wi +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in +# distributed under the License is distributed on a +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, eit +# See the License for the specific language governi +# limitations under the License. + + +def test_data_visualization() -> None: + # [START bigquery_dataframes_data_visualization_penguin_histogram] + import bigframes.pandas as bpd + + penguins = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") + penguins["culmen_depth_mm"].plot.hist(bins=40) + # [END bigquery_dataframes_data_visualization_penguin_histogram] + + # [START bigquery_dataframes_data_visualization_noaa_line_chart] + import bigframes.pandas as bpd + + noaa_surface = bpd.read_gbq("bigquery-public-data.noaa_gsod.gsod2021") + + # Calculate median temperature for each day + noaa_surface_median_temps = noaa_surface[["date", "temp"]].groupby("date").median() + + noaa_surface_median_temps.plot.line() + # [END bigquery_dataframes_data_visualization_noaa_line_chart] + + # [START bigquery_dataframes_data_visualization_usa_names_area_chart] + import bigframes.pandas as bpd + + usa_names = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013") + + # Count the occurences of the target names each year. The result is a dataframe with a multi-index. 
+ name_counts = ( + usa_names[usa_names["name"].isin(("Mary", "Emily", "Lisa"))] + .groupby(("year", "name"))["number"] + .sum() + ) + + # Flatten the index of the dataframe so that the counts for each name has their own columns. + name_counts = name_counts.unstack(level=1).fillna(0) + + name_counts.plot.area(stacked=False, alpha=0.5) + # [END bigquery_dataframes_data_visualization_usa_names_area_chart] + + # [START bigquery_dataframes_data_visualization_penguin_bar_chart] + import bigframes.pandas as bpd + + penguins = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") + + penguin_count_by_sex = ( + penguins[penguins["sex"].isin(("MALE", "FEMALE"))] + .groupby("sex")["species"] + .count() + ) + penguin_count_by_sex.plot.bar() + # [END bigquery_dataframes_data_visualization_penguin_bar_chart] + + # [START bigquery_dataframes_data_visualization_taxi_scatter_plot] + import bigframes.pandas as bpd + + taxi_trips = bpd.read_gbq( + "bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2021" + ).dropna() + + # Data Cleaning + taxi_trips = taxi_trips[ + taxi_trips["trip_distance"].between(0, 10, inclusive="right") + ] + taxi_trips = taxi_trips[taxi_trips["fare_amount"].between(0, 50, inclusive="right")] + + # If you are using partial ordering mode, you will also need to assign an order to your dataset. + # Otherwise, the next line can be skipped. 
+ taxi_trips = taxi_trips.sort_values("pickup_datetime") + + taxi_trips.plot.scatter(x="trip_distance", y="fare_amount", alpha=0.5) + # [END bigquery_dataframes_data_visualization_taxi_scatter_plot] + + # [START bigquery_dataframes_data_visualization_noaa_sampling_n] + import bigframes.pandas as bpd + + noaa_surface = bpd.read_gbq("bigquery-public-data.noaa_gsod.gsod2021") + + # Calculate median temperature for each day + noaa_surface_median_temps = noaa_surface[["date", "temp"]].groupby("date").median() + + noaa_surface_median_temps.plot.line(sampling_n=40) + # [END bigquery_dataframes_data_visualization_noaa_sampling_n] + + # [START bigquery_dataframes_data_visualization_usa_names_subplots] + import bigframes.pandas as bpd + + usa_names = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013") + + # Count the occurences of the target names each year. The result is a dataframe with a multi-index. + name_counts = ( + usa_names[usa_names["name"].isin(("Mary", "Emily", "Lisa"))] + .groupby(("year", "name"))["number"] + .sum() + ) + + # Flatten the index of the dataframe so that the counts for each name has their own columns. + name_counts = name_counts.unstack(level=1).fillna(0) + + name_counts.plot.area(subplots=True, alpha=0.5) + # [END bigquery_dataframes_data_visualization_usa_names_subplots] + + # [START bigquery_dataframes_data_visualization_taxi_scatter_multidimension] + import bigframes.pandas as bpd + + taxi_trips = bpd.read_gbq( + "bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2021" + ).dropna() + + # Data Cleaning + taxi_trips = taxi_trips[ + taxi_trips["trip_distance"].between(0, 10, inclusive="right") + ] + taxi_trips = taxi_trips[taxi_trips["fare_amount"].between(0, 50, inclusive="right")] + + # If you are using partial ordering mode, you also need to assign an order to your dataset. + # Otherwise, the next line can be skipped. 
+ taxi_trips = taxi_trips.sort_values("pickup_datetime") + + taxi_trips["passenger_count_scaled"] = taxi_trips["passenger_count"] * 30 + + taxi_trips.plot.scatter( + x="trip_distance", + xlabel="trip distance (miles)", + y="fare_amount", + ylabel="fare amount (usd)", + alpha=0.5, + s="passenger_count_scaled", + label="passenger_count", + c="tip_amount", + cmap="jet", + colorbar=True, + legend=True, + figsize=(15, 7), + sampling_n=1000, + ) + # [END bigquery_dataframes_data_visualization_taxi_scatter_multidimension] diff --git a/scripts/create_read_gbq_colab_benchmark_tables.py b/scripts/create_read_gbq_colab_benchmark_tables.py new file mode 100644 index 0000000000..63419bc660 --- /dev/null +++ b/scripts/create_read_gbq_colab_benchmark_tables.py @@ -0,0 +1,541 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import argparse +import base64 +import concurrent.futures +import datetime +import json +import math +import time +from typing import Any, Iterable, MutableSequence, Sequence + +from google.cloud import bigquery +import numpy as np + +# --- Input Data --- +# Generated by querying bigquery-magics usage. See internal issue b/420984164. 
+TABLE_STATS: dict[str, list[float]] = { + "percentile": [9, 19, 29, 39, 49, 59, 69, 79, 89, 99], + "materialized_or_scanned_bytes": [ + 0.0, + 0.0, + 4102.0, + 76901.0, + 351693.0, + 500000.0, + 500000.0, + 1320930.0, + 17486432.0, + 1919625975.0, + ], + "avg_row_bytes": [ + 0.00014346299635435792, + 0.005370969708923197, + 0.3692756731526246, + 4.079344721151818, + 7.5418, + 12.528863516404146, + 22.686258546389798, + 48.69689224091025, + 100.90817356205852, + 2020, + ], + "materialized_mb": [ + 0.0, + 0.0, + 0.004102, + 0.076901, + 0.351693, + 0.5, + 0.5, + 1.32093, + 17.486432, + 1919.625975, + ], +} + +BIGQUERY_DATA_TYPE_SIZES = { + "BOOL": 1, + "DATE": 8, + "FLOAT64": 8, + "INT64": 8, + "DATETIME": 8, + "TIMESTAMP": 8, + "TIME": 8, + "NUMERIC": 16, + # Flexible types. + # JSON base size is its content, BYTES/STRING have 2 byte overhead + content + "JSON": 0, + "BYTES": 2, + "STRING": 2, +} +FIXED_TYPES = [ + "BOOL", + "INT64", + "FLOAT64", + "NUMERIC", + "DATE", + "DATETIME", + "TIMESTAMP", + "TIME", +] +FLEXIBLE_TYPES = ["STRING", "BYTES", "JSON"] + +JSON_CHAR_LIST = list("abcdef") +STRING_CHAR_LIST = list("abcdefghijklmnopqrstuvwxyz0123456789") + +# --- Helper Functions --- + + +def get_bq_schema(target_row_size_bytes: int) -> Sequence[tuple[str, str, int | None]]: + """ + Determines the BigQuery table schema to match the target_row_size_bytes. + Prioritizes fixed-size types for diversity, then uses flexible types. + Returns a list of tuples: (column_name, type_name, length_for_flexible_type). + Length is None for fixed-size types. + """ + schema: MutableSequence[tuple[str, str, int | None]] = [] + current_size = 0 + col_idx = 0 + + for bq_type in FIXED_TYPES: + # For simplicity, we'll allow slight overage if only fixed fields are chosen. 
+ if current_size >= target_row_size_bytes: + break + + type_size = BIGQUERY_DATA_TYPE_SIZES[bq_type] + schema.append((f"col_{bq_type.lower()}_{col_idx}", bq_type, None)) + current_size += type_size + col_idx += 1 + + # Use flexible-size types to fill remaining space + + # Attempt to add one of each flexible type if space allows + if current_size < target_row_size_bytes: + remaining_bytes_for_content = target_row_size_bytes - current_size + + # For simplicity, divide the remaing bytes evenly across the flexible + # columns. + target_size = int(math.ceil(remaining_bytes_for_content / len(FLEXIBLE_TYPES))) + + for bq_type in FLEXIBLE_TYPES: + base_cost = BIGQUERY_DATA_TYPE_SIZES[bq_type] + min_content_size = max(0, target_size - base_cost) + + schema.append( + (f"col_{bq_type.lower()}_{col_idx}", bq_type, min_content_size) + ) + current_size += base_cost + min_content_size + col_idx += 1 + + return schema + + +def generate_bool_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + return rng.choice([True, False], size=num_rows) + + +def generate_int64_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + return rng.integers(-(10**18), 10**18, size=num_rows, dtype=np.int64) + + +def generate_float64_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + return rng.random(size=num_rows) * 2 * 10**10 - 10**10 + + +def generate_numeric_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + raw_numerics = rng.random(size=num_rows) * 2 * 10**28 - 10**28 + format_numeric_vectorized = np.vectorize(lambda x: f"{x:.9f}") + return format_numeric_vectorized(raw_numerics) + + +def generate_date_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + start_date_ord = datetime.date(1, 1, 1).toordinal() + max_days = (datetime.date(9999, 12, 
31) - datetime.date(1, 1, 1)).days + day_offsets = rng.integers(0, max_days + 1, size=num_rows) + date_ordinals = start_date_ord + day_offsets + return np.array( + [ + datetime.date.fromordinal(int(ordinal)).isoformat() + for ordinal in date_ordinals + ] + ) + + +def generate_numpy_datetimes(num_rows: int, rng: np.random.Generator) -> np.ndarray: + # Generate seconds from a broad range (e.g., year 1 to 9999) + # Note: Python's datetime.timestamp() might be limited by system's C mktime. + # For broader range with np.datetime64, it's usually fine. + # Let's generate epoch seconds relative to Unix epoch for np.datetime64 compatibility + min_epoch_seconds = int( + datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc).timestamp() + ) + # Max for datetime64[s] is far out, but let's bound it reasonably for BQ. + max_epoch_seconds = int( + datetime.datetime( + 9999, 12, 28, 23, 59, 59, tzinfo=datetime.timezone.utc + ).timestamp() + ) + + epoch_seconds = rng.integers( + min_epoch_seconds, + max_epoch_seconds + 1, + size=num_rows, + dtype=np.int64, + ) + microseconds_offset = rng.integers(0, 1000000, size=num_rows, dtype=np.int64) + + # Create datetime64[s] from epoch seconds and add microseconds as timedelta64[us] + np_timestamps_s = epoch_seconds.astype("datetime64[s]") + np_microseconds_td = microseconds_offset.astype("timedelta64[us]") + return np_timestamps_s + np_microseconds_td + + +def generate_datetime_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + np_datetimes = generate_numpy_datetimes(num_rows, rng) + + # np.datetime_as_string produces 'YYYY-MM-DDTHH:MM:SS.ffffff' + # BQ DATETIME typically uses a space separator: 'YYYY-MM-DD HH:MM:SS.ffffff' + datetime_strings = np.datetime_as_string(np_datetimes, unit="us") + return np.array([s.replace("T", " ") for s in datetime_strings]) + + +def generate_timestamp_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> 
np.ndarray: + np_datetimes = generate_numpy_datetimes(num_rows, rng) + + # Convert to string with UTC timezone indicator + # np.datetime_as_string with timezone='UTC' produces 'YYYY-MM-DDTHH:MM:SS.ffffffZ' + # BigQuery generally accepts this for TIMESTAMP. + return np.datetime_as_string(np_datetimes, unit="us", timezone="UTC") + + +def generate_time_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + hours = rng.integers(0, 24, size=num_rows) + minutes = rng.integers(0, 60, size=num_rows) + seconds = rng.integers(0, 60, size=num_rows) + microseconds = rng.integers(0, 1000000, size=num_rows) + time_list = [ + datetime.time(hours[i], minutes[i], seconds[i], microseconds[i]).isoformat() + for i in range(num_rows) + ] + return np.array(time_list) + + +def generate_json_row(content_length: int, rng: np.random.Generator) -> str: + json_val_len = max(0, content_length - 5) + json_val_chars = rng.choice(JSON_CHAR_LIST, size=json_val_len) + json_obj = {"k": "".join(json_val_chars)} + return json.dumps(json_obj) + + +def generate_json_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + content_length = content_length if content_length is not None else 10 + json_list = [ + generate_json_row(content_length=content_length, rng=rng) + for _ in range(num_rows) + ] + return np.array(json_list) + + +def generate_string_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + content_length = content_length if content_length is not None else 1 + content_length = max(0, content_length) + chars_array = rng.choice(STRING_CHAR_LIST, size=(num_rows, content_length)) + return np.array(["".join(row_chars) for row_chars in chars_array]) + + +def generate_bytes_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + content_length = content_length if content_length is not None else 1 + 
content_length = max(0, content_length) + return np.array( + [ + base64.b64encode(rng.bytes(content_length)).decode("utf-8") + for _ in range(num_rows) + ] + ) + + +BIGQUERY_DATA_TYPE_GENERATORS = { + "BOOL": generate_bool_batch, + "DATE": generate_date_batch, + "FLOAT64": generate_float64_batch, + "INT64": generate_int64_batch, + "DATETIME": generate_datetime_batch, + "TIMESTAMP": generate_timestamp_batch, + "TIME": generate_time_batch, + "NUMERIC": generate_numeric_batch, + "JSON": generate_json_batch, + "BYTES": generate_bytes_batch, + "STRING": generate_string_batch, +} + + +def generate_work_items( + table_id: str, + schema: Sequence[tuple[str, str, int | None]], + num_rows: int, + batch_size: int, +) -> Iterable[tuple[str, Sequence[tuple[str, str, int | None]], int]]: + """ + Generates work items of appropriate batch sizes. + """ + if num_rows == 0: + return + + generated_rows_total = 0 + + while generated_rows_total < num_rows: + current_batch_size = min(batch_size, num_rows - generated_rows_total) + if current_batch_size == 0: + break + + yield (table_id, schema, current_batch_size) + generated_rows_total += current_batch_size + + +def generate_batch( + schema: Sequence[tuple[str, str, int | None]], + num_rows: int, + rng: np.random.Generator, +) -> list[dict[str, Any]]: + col_names_ordered = [s[0] for s in schema] + + columns_data_batch = {} + for col_name, bq_type, length in schema: + generate_batch = BIGQUERY_DATA_TYPE_GENERATORS[bq_type] + columns_data_batch[col_name] = generate_batch( + num_rows, rng, content_length=length + ) + + # Turn numpy objects into Python objects. 
+ # https://stackoverflow.com/a/32850511/101923 + columns_data_batch_json = {} + for column in columns_data_batch: + columns_data_batch_json[column] = columns_data_batch[column].tolist() + + # Assemble batch of rows + batch_data = [] + for i in range(num_rows): + row = { + col_name: columns_data_batch_json[col_name][i] + for col_name in col_names_ordered + } + batch_data.append(row) + + return batch_data + + +def generate_and_load_batch( + client: bigquery.Client, + table_id: str, + schema_def: Sequence[tuple[str, str, int | None]], + num_rows: int, + rng: np.random.Generator, +): + bq_schema = [] + for col_name, type_name, _ in schema_def: + bq_schema.append(bigquery.SchemaField(col_name, type_name)) + table = bigquery.Table(table_id, schema=bq_schema) + + generated_data_chunk = generate_batch(schema_def, num_rows, rng) + errors = client.insert_rows_json(table, generated_data_chunk) + if errors: + raise ValueError(f"Encountered errors while inserting sub-batch: {errors}") + + +def create_and_load_table( + client: bigquery.Client | None, + project_id: str, + dataset_id: str, + table_name: str, + schema_def: Sequence[tuple[str, str, int | None]], + num_rows: int, + executor: concurrent.futures.Executor, +): + """Creates a BigQuery table and loads data into it by consuming a data generator.""" + + if not client: + print(f"Simulating: Generated schema: {schema_def}") + return + + # BQ client library streaming insert batch size (rows per API call) + # This is different from data_gen_batch_size which is for generating data. + # We can make BQ_LOAD_BATCH_SIZE smaller than data_gen_batch_size if needed. + BQ_LOAD_BATCH_SIZE = 500 + + # Actual BigQuery operations occur here because both project_id and dataset_id are provided + print( + f"Attempting BigQuery operations for table {table_name} in project '{project_id}', dataset '{dataset_id}'." 
+ ) + table_id = f"{project_id}.{dataset_id}.{table_name}" + + bq_schema = [] + for col_name, type_name, _ in schema_def: + bq_schema.append(bigquery.SchemaField(col_name, type_name)) + + table = bigquery.Table(table_id, schema=bq_schema) + print(f"(Re)creating table {table_id}...") + table = client.create_table(table, exists_ok=True) + print(f"Table {table_id} created successfully or already exists.") + + # Query in case there's something in the streaming buffer already. + table_rows = next( + iter(client.query_and_wait(f"SELECT COUNT(*) FROM `{table_id}`")) + )[0] + print(f"Table {table_id} has {table_rows} rows.") + num_rows = max(0, num_rows - table_rows) + + if num_rows <= 0: + print(f"No rows to load. Requested {num_rows} rows. Skipping.") + return + + print(f"Starting to load {num_rows} rows into {table_id} in batches...") + + previous_status_time = 0.0 + generated_rows_total = 0 + + for completed_rows in executor.map( + worker_process_item, + generate_work_items( + table_id, + schema_def, + num_rows, + BQ_LOAD_BATCH_SIZE, + ), + ): + generated_rows_total += completed_rows + + current_time = time.monotonic() + if current_time - previous_status_time > 5: + print(f"Wrote {generated_rows_total} out of {num_rows} rows.") + previous_status_time = current_time + + +worker_client: bigquery.Client | None = None +worker_rng: np.random.Generator | None = None + + +def worker_initializer(project_id: str | None): + global worker_client, worker_rng + + # One client per process, since multiprocessing and client connections don't + # play nicely together. 
+ if project_id is not None: + worker_client = bigquery.Client(project=project_id) + + worker_rng = np.random.default_rng() + + +def worker_process_item( + work_item: tuple[str, Sequence[tuple[str, str, int | None]], int] +): + global worker_client, worker_rng + + if worker_client is None or worker_rng is None: + raise ValueError("Worker not initialized.") + + table_id, schema_def, num_rows = work_item + generate_and_load_batch(worker_client, table_id, schema_def, num_rows, worker_rng) + return num_rows + + +# --- Main Script Logic --- +def main(): + """Main function to create and populate BigQuery tables.""" + + parser = argparse.ArgumentParser( + description="Generate and load BigQuery benchmark tables." + ) + parser.add_argument( + "-p", + "--project_id", + type=str, + default=None, + help="Google Cloud Project ID. If not provided, script runs in simulation mode.", + ) + parser.add_argument( + "-d", + "--dataset_id", + type=str, + default=None, + help="BigQuery Dataset ID within the project. 
If not provided, script runs in simulation mode.", + ) + args = parser.parse_args() + + num_percentiles = len(TABLE_STATS["percentile"]) + client = None + + if args.project_id and args.dataset_id: + client = bigquery.Client(project=args.project_id) + dataset = bigquery.Dataset(f"{args.project_id}.{args.dataset_id}") + client.create_dataset(dataset, exists_ok=True) + + with concurrent.futures.ProcessPoolExecutor( + initializer=worker_initializer, initargs=(args.project_id,) + ) as executor: + for i in range(num_percentiles): + percentile = TABLE_STATS["percentile"][i] + avg_row_bytes_raw = TABLE_STATS["avg_row_bytes"][i] + table_bytes_raw = TABLE_STATS["materialized_or_scanned_bytes"][i] + + target_table_bytes = max(1, int(math.ceil(table_bytes_raw))) + target_row_bytes = max(1, int(math.ceil(avg_row_bytes_raw))) + num_rows = max(1, int(math.ceil(target_table_bytes / target_row_bytes))) + + table_name = f"percentile_{percentile:02d}" + print(f"\n--- Processing Table: {table_name} ---") + print(f"Target average row bytes (rounded up): {target_row_bytes}") + print(f"Number of rows (rounded up): {num_rows}") + + schema_definition = get_bq_schema(target_row_bytes) + print(f"Generated Schema: {schema_definition}") + + create_and_load_table( + client, + args.project_id or "", + args.dataset_id or "", + table_name, + schema_definition, + num_rows, + executor, + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/create_read_gbq_colab_benchmark_tables_test.py b/scripts/create_read_gbq_colab_benchmark_tables_test.py new file mode 100644 index 0000000000..89c49e4243 --- /dev/null +++ b/scripts/create_read_gbq_colab_benchmark_tables_test.py @@ -0,0 +1,333 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import base64 +import datetime +import json +import math +import re + +# Assuming the script to be tested is in the same directory or accessible via PYTHONPATH +from create_read_gbq_colab_benchmark_tables import ( + BIGQUERY_DATA_TYPE_SIZES, + generate_batch, + generate_work_items, + get_bq_schema, +) +import numpy as np +import pytest + + +# Helper function to calculate estimated row size from schema +def _calculate_row_size(schema: list[tuple[str, str, int | None]]) -> int: + """Calculates the estimated byte size of a row based on the schema. + Note: This is a simplified calculation for testing and might not perfectly + match BigQuery's internal storage, especially for complex types or NULLs. 
+ """ + size = 0 + for _, bq_type, length in schema: + if bq_type in ["STRING", "BYTES", "JSON"]: + # Base cost (e.g., 2 bytes) + content length + size += BIGQUERY_DATA_TYPE_SIZES[bq_type] + ( + length if length is not None else 0 + ) + elif bq_type in BIGQUERY_DATA_TYPE_SIZES: + size += BIGQUERY_DATA_TYPE_SIZES[bq_type] + else: + raise AssertionError(f"Got unexpected type {bq_type}") + return size + + +# --- Tests for get_bq_schema --- + + +def test_get_bq_schema_zero_bytes(): + assert get_bq_schema(0) == [] + + +def test_get_bq_schema_one_byte(): + schema = get_bq_schema(1) + + assert len(schema) == 1 + assert schema[0][1] == "BOOL" # ('col_bool_fallback_0', 'BOOL', None) or similar + assert _calculate_row_size(schema) == 1 + + +def test_get_bq_schema_exact_fixed_fit(): + # BOOL (1) + INT64 (8) = 9 bytes + target_size = 9 + schema = get_bq_schema(target_size) + + assert len(schema) == 2 + assert schema[0][1] == "BOOL" + assert schema[1][1] == "INT64" + assert _calculate_row_size(schema) == target_size + + +def test_get_bq_schema_needs_flexible_string(): + # Sum of all fixed types: + # BOOL 1, INT64 8, FLOAT64 8, NUMERIC 16, DATE 8, DATETIME 8, TIMESTAMP 8, TIME 8 + # Total = 1+8+8+16+8+8+8+8 = 65 + target_size = 65 + 1 + schema = get_bq_schema(target_size) + + assert _calculate_row_size(schema) == 65 + 2 + 2 + 1 + + string_cols = [s for s in schema if s[1] == "STRING"] + assert len(string_cols) == 1 + assert string_cols[0][2] == 0 + + bytes_cols = [s for s in schema if s[1] == "BYTES"] + assert len(bytes_cols) == 1 + assert bytes_cols[0][2] == 0 + + json_cols = [s for s in schema if s[1] == "JSON"] + assert len(json_cols) == 1 + assert json_cols[0][2] == 1 + + +def test_get_bq_schema_flexible_expansion(): + # Sum of all fixed types: + # BOOL 1, INT64 8, FLOAT64 8, NUMERIC 16, DATE 8, DATETIME 8, TIMESTAMP 8, TIME 8 + # Total = 1+8+8+16+8+8+8+8 = 65 + target_size = 65 + 3 * 5 + schema = get_bq_schema(target_size) + + assert _calculate_row_size(schema) == 
target_size + + string_cols = [s for s in schema if s[1] == "STRING"] + assert len(string_cols) == 1 + assert string_cols[0][2] == 3 + + bytes_cols = [s for s in schema if s[1] == "BYTES"] + assert len(bytes_cols) == 1 + assert bytes_cols[0][2] == 3 + + json_cols = [s for s in schema if s[1] == "JSON"] + assert len(json_cols) == 1 + assert json_cols[0][2] == 5 + + +def test_get_bq_schema_all_fixed_types_possible(): + # Sum of all fixed types: + # BOOL 1, INT64 8, FLOAT64 8, NUMERIC 16, DATE 8, DATETIME 8, TIMESTAMP 8, TIME 8 + # Total = 1+8+8+16+8+8+8+8 = 65 + target_size = 65 + schema = get_bq_schema(target_size) + + expected_fixed_types = { + "BOOL", + "INT64", + "FLOAT64", + "NUMERIC", + "DATE", + "DATETIME", + "TIMESTAMP", + "TIME", + } + present_types = {s[1] for s in schema} + + assert expected_fixed_types.issubset(present_types) + + # Check if the size is close to target. + # All fixed (65) + calculated_size = _calculate_row_size(schema) + assert calculated_size == target_size + + +def test_get_bq_schema_uniqueness_of_column_names(): + target_size = 100 # A size that generates multiple columns + schema = get_bq_schema(target_size) + + column_names = [s[0] for s in schema] + assert len(column_names) == len(set(column_names)) + + +# --- Tests for generate_work_items --- + + +def test_generate_work_items_zero_rows(): + schema = [("col_int", "INT64", None)] + data_generator = generate_work_items( + "some_table", schema, num_rows=0, batch_size=10 + ) + + # Expect the generator to be exhausted + with pytest.raises(StopIteration): + next(data_generator) + + +def test_generate_work_items_basic_schema_and_batching(): + schema = [("id", "INT64", None), ("is_active", "BOOL", None)] + num_rows = 25 + batch_size = 10 + + generated_rows_count = 0 + batch_count = 0 + for work_item in generate_work_items("some_table", schema, num_rows, batch_size): + table_id, schema_def, num_rows_in_batch = work_item + assert table_id == "some_table" + assert schema_def == schema + assert 
num_rows_in_batch <= num_rows + assert num_rows_in_batch <= batch_size + batch_count += 1 + generated_rows_count += num_rows_in_batch + + assert generated_rows_count == num_rows + assert batch_count == math.ceil(num_rows / batch_size) # 25/10 = 2.5 -> 3 batches + + +def test_generate_work_items_batch_size_larger_than_num_rows(): + schema = [("value", "FLOAT64", None)] + num_rows = 5 + batch_size = 100 + + generated_rows_count = 0 + batch_count = 0 + for work_item in generate_work_items("some_table", schema, num_rows, batch_size): + table_id, schema_def, num_rows_in_batch = work_item + assert table_id == "some_table" + assert schema_def == schema + assert num_rows_in_batch == num_rows # Should be one batch with all rows + batch_count += 1 + generated_rows_count += num_rows_in_batch + + assert generated_rows_count == num_rows + assert batch_count == 1 + + +def test_generate_work_items_all_datatypes(rng): + schema = [ + ("c_bool", "BOOL", None), + ("c_int64", "INT64", None), + ("c_float64", "FLOAT64", None), + ("c_numeric", "NUMERIC", None), + ("c_date", "DATE", None), + ("c_datetime", "DATETIME", None), + ("c_timestamp", "TIMESTAMP", None), + ("c_time", "TIME", None), + ("c_string", "STRING", 10), + ("c_bytes", "BYTES", 5), + ("c_json", "JSON", 20), # Length for JSON is content hint + ] + num_rows = 3 + batch_size = 2 # To test multiple batches + + total_rows_processed = 0 + for work_item in generate_work_items("some_table", schema, num_rows, batch_size): + table_id, schema_def, num_rows_in_batch = work_item + assert table_id == "some_table" + assert schema_def == schema + assert num_rows_in_batch <= batch_size + assert num_rows_in_batch <= num_rows + + total_rows_processed += num_rows_in_batch + + assert total_rows_processed == num_rows + + +# --- Pytest Fixture for RNG --- +@pytest.fixture +def rng(): + return np.random.default_rng(seed=42) + + +def test_generate_batch_basic_schema(rng): + schema = [("id", "INT64", None), ("is_active", "BOOL", None)] + batch = 
generate_batch(schema, 5, rng) + + assert len(batch) == 5 + + for row in batch: + assert isinstance(row, dict) + assert "id" in row + assert "is_active" in row + assert isinstance(row["id"], int) + assert isinstance(row["is_active"], bool) + + +def test_generate_batch_all_datatypes(rng): + schema = [ + ("c_bool", "BOOL", None), + ("c_int64", "INT64", None), + ("c_float64", "FLOAT64", None), + ("c_numeric", "NUMERIC", None), + ("c_date", "DATE", None), + ("c_datetime", "DATETIME", None), + ("c_timestamp", "TIMESTAMP", None), + ("c_time", "TIME", None), + ("c_string", "STRING", 10), + ("c_bytes", "BYTES", 5), + ("c_json", "JSON", 20), # Length for JSON is content hint + ] + num_rows = 3 + + date_pattern = re.compile(r"^\d{4}-\d{2}-\d{2}$") + time_pattern = re.compile(r"^\d{2}:\d{2}:\d{2}(\.\d{1,6})?$") + # BQ DATETIME: YYYY-MM-DD HH:MM:SS.ffffff + datetime_pattern = re.compile(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d{1,6})?$") + # BQ TIMESTAMP (UTC 'Z'): YYYY-MM-DDTHH:MM:SS.ffffffZ + timestamp_pattern = re.compile( + r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,6})?Z$" + ) + numeric_pattern = re.compile(r"^-?\d+\.\d{9}$") + + batch = generate_batch(schema, num_rows, rng) + assert len(batch) == num_rows + + for row in batch: + assert isinstance(row["c_bool"], bool) + assert isinstance(row["c_int64"], int) + assert isinstance(row["c_float64"], float) + + assert isinstance(row["c_numeric"], str) + assert numeric_pattern.match(row["c_numeric"]) + + assert isinstance(row["c_date"], str) + assert date_pattern.match(row["c_date"]) + datetime.date.fromisoformat(row["c_date"]) # Check parsable + + assert isinstance(row["c_datetime"], str) + assert datetime_pattern.match(row["c_datetime"]) + datetime.datetime.fromisoformat(row["c_datetime"]) # Check parsable + + assert isinstance(row["c_timestamp"], str) + assert timestamp_pattern.match(row["c_timestamp"]) + # datetime.fromisoformat can parse 'Z' if Python >= 3.11, or needs replace('Z', '+00:00') + dt_obj = 
datetime.datetime.fromisoformat( + row["c_timestamp"].replace("Z", "+00:00") + ) + assert dt_obj.tzinfo == datetime.timezone.utc + + assert isinstance(row["c_time"], str) + assert time_pattern.match(row["c_time"]) + datetime.time.fromisoformat(row["c_time"]) # Check parsable + + assert isinstance(row["c_string"], str) + assert len(row["c_string"]) == 10 + + c_bytes = base64.b64decode(row["c_bytes"]) + assert isinstance(c_bytes, bytes) + assert len(c_bytes) == 5 + + assert isinstance(row["c_json"], str) + try: + json.loads(row["c_json"]) # Check if it's valid JSON + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON string generated: {row['c_json']}") + # Note: Exact length check for JSON is hard due to content variability and escaping. + # The 'length' parameter for JSON in schema is a hint for content size. + # We are primarily testing that it's valid JSON. diff --git a/scripts/readme-gen/readme_gen.py b/scripts/readme-gen/readme_gen.py index 8f5e248a0d..ceb1eada7c 100644 --- a/scripts/readme-gen/readme_gen.py +++ b/scripts/readme-gen/readme_gen.py @@ -24,7 +24,6 @@ import jinja2 import yaml - jinja_env = jinja2.Environment( trim_blocks=True, loader=jinja2.FileSystemLoader( diff --git a/testing/constraints-3.10.txt b/testing/constraints-3.10.txt index b11ab5a88d..12ad443aab 100644 --- a/testing/constraints-3.10.txt +++ b/testing/constraints-3.10.txt @@ -15,3 +15,4 @@ matplotlib==3.7.1 psutil==5.9.5 seaborn==0.13.1 traitlets==5.7.1 +polars==1.7.0 diff --git a/tests/benchmark/.gitignore b/tests/benchmark/.gitignore new file mode 100644 index 0000000000..f1bf042bf7 --- /dev/null +++ b/tests/benchmark/.gitignore @@ -0,0 +1,6 @@ +*.bytesprocessed +*.bq_exec_time_seconds +*.error +*.local_exec_time_seconds +*.query_char_count +*.slotmillis diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py new file mode 100644 index 0000000000..dda4bf95a4 --- /dev/null +++ 
b/tests/benchmark/read_gbq_colab/aggregate_output.py @@ -0,0 +1,72 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + +PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE + + +def aggregate_output( + *, project_id, dataset_id, table_id, session: bigframes.session.Session +): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. "{local_inline}" or "{local_large}" + df = session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" + ) + + # Simulate getting the first page, since we'll always do that first in the UI. + df.shape + next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + + # To simulate very small rows that can only fit a boolean, + # some tables don't have an integer column. If an integer column is available, + # we prefer to group by that to get a more realistic number of groups. 
+ group_column = "col_int64_1" + if group_column not in df.columns: + group_column = "col_bool_0" + + # Simulate the user aggregating by a column and visualizing those results + df_aggregated = ( + df.assign(rounded=df[group_column].astype("Int64").round(-9)) + .groupby("rounded") + .sum(numeric_only=True) + ) + + df_aggregated.shape + next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE))) + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + aggregate_output, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/read_gbq_colab/config.jsonl b/tests/benchmark/read_gbq_colab/config.jsonl new file mode 100644 index 0000000000..6f1ddf4a5f --- /dev/null +++ b/tests/benchmark/read_gbq_colab/config.jsonl @@ -0,0 +1,10 @@ +{"benchmark_suffix": "percentile_09", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_09", "ordered": false} +{"benchmark_suffix": "percentile_19", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_19", "ordered": false} +{"benchmark_suffix": "percentile_29", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_29", "ordered": false} +{"benchmark_suffix": "percentile_39", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_39", "ordered": false} +{"benchmark_suffix": "percentile_49", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_49", "ordered": false} +{"benchmark_suffix": "percentile_59", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_59", "ordered": false} 
+{"benchmark_suffix": "percentile_69", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_69", "ordered": false} +{"benchmark_suffix": "percentile_79", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_79", "ordered": false} +{"benchmark_suffix": "percentile_89", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_89", "ordered": false} +{"benchmark_suffix": "percentile_99", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_99", "ordered": false} diff --git a/tests/benchmark/read_gbq_colab/dry_run.py b/tests/benchmark/read_gbq_colab/dry_run.py new file mode 100644 index 0000000000..c2de1b7cc4 --- /dev/null +++ b/tests/benchmark/read_gbq_colab/dry_run.py @@ -0,0 +1,48 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + + +def dry_run(*, project_id, dataset_id, table_id, session: bigframes.session.Session): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. 
"{local_inline}" or "{local_large}" + session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}", + dry_run=True, + ) + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + dry_run, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py new file mode 100644 index 0000000000..5e872bb727 --- /dev/null +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -0,0 +1,68 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils +import pytest + +import bigframes.session + +PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE + + +def filter_output( + *, project_id, dataset_id, table_id, session: bigframes.session.Session +): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. "{local_inline}" or "{local_large}" + df = session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" + ) + + # Simulate getting the first page, since we'll always do that first in the UI. 
+ df.shape + next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + + # Simulate the user filtering by a column and visualizing those results + df_filtered = df[df["col_bool_0"]] + rows, _ = df_filtered.shape + + # It's possible we don't have any pages at all, since we filtered out all + # matching rows. + if rows == 0: + with pytest.raises(StopIteration): + next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE))) + else: + next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE))) + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + filter_output, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py new file mode 100644 index 0000000000..2df9990d22 --- /dev/null +++ b/tests/benchmark/read_gbq_colab/first_page.py @@ -0,0 +1,53 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + +PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE + + +def first_page(*, project_id, dataset_id, table_id, session: bigframes.session.Session): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. 
"{local_inline}" or "{local_large}" + df = session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" + ) + + # Get number of rows (to calculate number of pages) and the first page. + df.shape + next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + first_page, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py new file mode 100644 index 0000000000..ad785a29e8 --- /dev/null +++ b/tests/benchmark/read_gbq_colab/last_page.py @@ -0,0 +1,54 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + +PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE + + +def last_page(*, project_id, dataset_id, table_id, session: bigframes.session.Session): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. "{local_inline}" or "{local_large}" + df = session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" + ) + + # Get number of rows (to calculate number of pages) and then all pages. 
+ df.shape + for _ in df.to_pandas_batches(page_size=PAGE_SIZE): + pass + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + last_page, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py new file mode 100644 index 0000000000..997de5683d --- /dev/null +++ b/tests/benchmark/read_gbq_colab/sort_output.py @@ -0,0 +1,64 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + +PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE + + +def sort_output( + *, project_id, dataset_id, table_id, session: bigframes.session.Session +): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. "{local_inline}" or "{local_large}" + df = session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" + ) + + # Simulate getting the first page, since we'll always do that first in the UI. 
+ df.shape + next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + + # Simulate the user sorting by a column and visualizing those results + sort_column = "col_int64_1" + if sort_column not in df.columns: + sort_column = "col_bool_0" + + df_sorted = df.sort_values(sort_column) + df_sorted.shape + next(iter(df_sorted.to_pandas_batches(page_size=PAGE_SIZE))) + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + sort_output, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/utils.py b/tests/benchmark/utils.py index 887d54dba2..48357ddde7 100644 --- a/tests/benchmark/utils.py +++ b/tests/benchmark/utils.py @@ -17,6 +17,8 @@ import bigframes +READ_GBQ_COLAB_PAGE_SIZE = 100 + def get_configuration(include_table_id=False): parser = argparse.ArgumentParser() @@ -94,6 +96,7 @@ def _str_to_bool(value): def _initialize_session(ordered: bool): + # TODO(tswast): add a flag to enable the polars semi-executor. 
context = bigframes.BigQueryOptions( location="US", ordering_mode="strict" if ordered else "partial" ) diff --git a/tests/system/large/operations/test_ai.py b/tests/system/large/operations/test_ai.py index ded5e0b588..86b30d9c65 100644 --- a/tests/system/large/operations/test_ai.py +++ b/tests/system/large/operations/test_ai.py @@ -848,65 +848,6 @@ def test_sim_join_data_too_large_raises_error(session, text_embedding_generator) ) -@pytest.mark.parametrize( - "instruction", - [ - pytest.param( - "No column reference", - id="zero_column", - marks=pytest.mark.xfail(raises=ValueError), - ), - pytest.param( - "{Animals}", - id="non_existing_column", - marks=pytest.mark.xfail(raises=ValueError), - ), - pytest.param( - "{Animals} and {Animals}", - id="two_columns", - marks=pytest.mark.xfail(raises=NotImplementedError), - ), - pytest.param( - "{index}", - id="preserved", - marks=pytest.mark.xfail(raises=ValueError), - ), - ], -) -def test_top_k_invalid_instruction_raise_error(instruction, gemini_flash_model): - df = dataframe.DataFrame( - { - "Animals": ["Dog", "Cat", "Bird", "Horse"], - "ID": [1, 2, 3, 4], - "index": ["a", "b", "c", "d"], - } - ) - - with bigframes.option_context( - AI_OP_EXP_OPTION, - True, - THRESHOLD_OPTION, - 10, - ): - df.ai.top_k(instruction, model=gemini_flash_model, k=2) - - -def test_top_k_invalid_k_raise_error(gemini_flash_model): - df = dataframe.DataFrame({"Animals": ["Dog", "Cat", "Bird", "Horse"]}) - - with bigframes.option_context( - AI_OP_EXP_OPTION, - True, - THRESHOLD_OPTION, - 10, - ), pytest.raises(ValueError): - df.ai.top_k( - "{Animals} are more popular as pets", - gemini_flash_model, - k=0, - ) - - @patch("builtins.input", return_value="") def test_confirm_operation__below_threshold_do_not_confirm(mock_input): df = dataframe.DataFrame({}) diff --git a/tests/system/small/engines/test_aggregation.py b/tests/system/small/engines/test_aggregation.py new file mode 100644 index 0000000000..2c323a5f28 --- /dev/null +++ 
b/tests/system/small/engines/test_aggregation.py @@ -0,0 +1,82 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from bigframes.core import array_value, expression, identifiers, nodes +import bigframes.operations.aggregations as agg_ops +from bigframes.session import polars_executor +from bigframes.testing.engine_utils import assert_equivalence_execution + +pytest.importorskip("polars") + +# Polars used as reference as its fast and local. Generally though, prefer gbq engine where they disagree. 
+REFERENCE_ENGINE = polars_executor.PolarsExecutor() + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_aggregate_size( + scalars_array_value: array_value.ArrayValue, + engine, +): + node = nodes.AggregateNode( + scalars_array_value.node, + aggregations=( + ( + expression.NullaryAggregation(agg_ops.SizeOp()), + identifiers.ColumnId("size_op"), + ), + ( + expression.UnaryAggregation( + agg_ops.SizeUnaryOp(), expression.deref("string_col") + ), + identifiers.ColumnId("unary_size_op"), + ), + ), + ) + assert_equivalence_execution(node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize( + "grouping_cols", + [ + ["bool_col"], + ["string_col", "int64_col"], + ["date_col"], + ["datetime_col"], + ["timestamp_col"], + ["bytes_col"], + ], +) +def test_engines_grouped_aggregate( + scalars_array_value: array_value.ArrayValue, engine, grouping_cols +): + node = nodes.AggregateNode( + scalars_array_value.node, + aggregations=( + ( + expression.NullaryAggregation(agg_ops.SizeOp()), + identifiers.ColumnId("size_op"), + ), + ( + expression.UnaryAggregation( + agg_ops.SizeUnaryOp(), expression.deref("string_col") + ), + identifiers.ColumnId("unary_size_op"), + ), + ), + by_column_ids=tuple(expression.deref(id) for id in grouping_cols), + ) + assert_equivalence_execution(node, REFERENCE_ENGINE, engine) diff --git a/tests/system/small/engines/test_comparison_ops.py b/tests/system/small/engines/test_comparison_ops.py new file mode 100644 index 0000000000..fefff93f58 --- /dev/null +++ b/tests/system/small/engines/test_comparison_ops.py @@ -0,0 +1,70 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import pytest + +from bigframes.core import array_value +import bigframes.operations as ops +from bigframes.session import polars_executor +from bigframes.testing.engine_utils import assert_equivalence_execution + +pytest.importorskip("polars") + +# Polars used as reference as its fast and local. Generally though, prefer gbq engine where they disagree. +REFERENCE_ENGINE = polars_executor.PolarsExecutor() + +# numeric domain + + +def apply_op_pairwise( + array: array_value.ArrayValue, op: ops.BinaryOp, excluded_cols=[] +) -> array_value.ArrayValue: + exprs = [] + for l_arg, r_arg in itertools.permutations(array.column_ids, 2): + if (l_arg in excluded_cols) or (r_arg in excluded_cols): + continue + try: + _ = op.output_type( + array.get_column_type(l_arg), array.get_column_type(r_arg) + ) + exprs.append(op.as_expr(l_arg, r_arg)) + except TypeError: + continue + assert len(exprs) > 0 + new_arr, _ = array.compute_values(exprs) + return new_arr + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize( + "op", + [ + ops.eq_op, + ops.eq_null_match_op, + ops.ne_op, + ops.gt_op, + ops.lt_op, + ops.le_op, + ops.ge_op, + ], +) +def test_engines_project_comparison_op( + scalars_array_value: array_value.ArrayValue, engine, op +): + # exclude string cols as does not contain dates + # bool col actually doesn't work properly for bq engine + arr = apply_op_pairwise(scalars_array_value, op, excluded_cols=["string_col"]) + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) diff --git 
a/tests/system/small/engines/test_slicing.py b/tests/system/small/engines/test_slicing.py new file mode 100644 index 0000000000..7340ff145b --- /dev/null +++ b/tests/system/small/engines/test_slicing.py @@ -0,0 +1,56 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from bigframes.core import array_value, nodes +from bigframes.session import polars_executor +from bigframes.testing.engine_utils import assert_equivalence_execution + +pytest.importorskip("polars") + +# Polars used as reference as its fast and local. Generally though, prefer gbq engine where they disagree. 
+REFERENCE_ENGINE = polars_executor.PolarsExecutor() + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize( + ("start", "stop", "step"), + [ + (1, None, None), + (None, 4, None), + (None, None, 2), + (None, 50_000_000_000, 1), + (5, 4, None), + (3, None, 2), + (1, 7, 2), + (1, 7, 50_000_000_000), + (-1, -7, -2), + (None, -7, -2), + (-1, None, -2), + (-7, -1, 2), + (-7, -1, None), + (-7, 7, None), + (7, -7, -2), + ], +) +def test_engines_slice( + scalars_array_value: array_value.ArrayValue, + engine, + start, + stop, + step, +): + node = nodes.SliceNode(scalars_array_value.node, start, stop, step) + assert_equivalence_execution(node, REFERENCE_ENGINE, engine) diff --git a/tests/system/small/operations/test_ai.py b/tests/system/small/operations/test_ai.py index 771b7b47d3..d6ec3cacad 100644 --- a/tests/system/small/operations/test_ai.py +++ b/tests/system/small/operations/test_ai.py @@ -74,11 +74,6 @@ def predict(self, *args, **kwargs): {"search_column": None, "query": None, "top_k": None, "model": None}, id="search", ), - pytest.param( - bigframes.operations.ai.AIAccessor.top_k, - {"instruction": None, "model": None}, - id="top_k", - ), pytest.param( bigframes.operations.ai.AIAccessor.sim_join, {"other": None, "left_on": None, "right_on": None, "model": None}, @@ -247,25 +242,6 @@ def test_join(session): ) -def test_top_k(session): - df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) - model = FakeGeminiTextGenerator( - dataframe.DataFrame( - {"ml_generate_text_llm_result": ["Document 1"]}, session=session - ), - ) - - with bigframes.option_context( - AI_OP_EXP_OPTION, - True, - THRESHOLD_OPTION, - 50, - ): - result = df.ai.top_k("top k of {col}", model, k=1).to_pandas() - - assert len(result) == 1 - - def test_forecast_default(time_series_df_default_index: dataframe.DataFrame): df = time_series_df_default_index[time_series_df_default_index["id"] == "1"] diff --git a/tests/system/small/pandas/io/__init__.py 
b/tests/system/small/pandas/io/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/system/small/pandas/io/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/system/small/pandas/io/api/__init__.py b/tests/system/small/pandas/io/api/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/system/small/pandas/io/api/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/system/small/pandas/io/api/test_read_gbq_colab.py b/tests/system/small/pandas/io/api/test_read_gbq_colab.py new file mode 100644 index 0000000000..6e848ed9ea --- /dev/null +++ b/tests/system/small/pandas/io/api/test_read_gbq_colab.py @@ -0,0 +1,329 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import datetime +import decimal + +import db_dtypes # type: ignore +import geopandas # type: ignore +import numpy +import pandas +import pyarrow +import pytest +import shapely.geometry # type: ignore + +from bigframes.pandas.io import api as module_under_test + + +@pytest.mark.parametrize( + ("df_pd",), + ( + # Regression tests for b/428190014. + # + # Test every BigQuery type we support, especially those where the legacy + # SQL type name differs from the GoogleSQL type name. 
+ # + # See: + # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types + # and compare to the legacy types at + # https://cloud.google.com/bigquery/docs/data-types + pytest.param( + pandas.DataFrame( + { + "ints": pandas.Series( + [[1], [2], [3]], + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.int64())), + ), + "floats": pandas.Series( + [[1.0], [2.0], [3.0]], + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.float64())), + ), + } + ), + id="arrays", + ), + pytest.param( + pandas.DataFrame( + { + "bool": pandas.Series([True, False, True], dtype="bool"), + "boolean": pandas.Series([True, None, True], dtype="boolean"), + "object": pandas.Series([True, None, True], dtype="object"), + "arrow": pandas.Series( + [True, None, True], dtype=pandas.ArrowDtype(pyarrow.bool_()) + ), + } + ), + id="bools", + ), + pytest.param( + pandas.DataFrame( + { + "bytes": pandas.Series([b"a", b"b", b"c"], dtype=numpy.bytes_), + "object": pandas.Series([b"a", None, b"c"], dtype="object"), + "arrow": pandas.Series( + [b"a", None, b"c"], dtype=pandas.ArrowDtype(pyarrow.binary()) + ), + } + ), + id="bytes", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.date(2023, 11, 23), + None, + datetime.date(1970, 1, 1), + ], + dtype="object", + ), + "arrow": pandas.Series( + [ + datetime.date(2023, 11, 23), + None, + datetime.date(1970, 1, 1), + ], + dtype=pandas.ArrowDtype(pyarrow.date32()), + ), + } + ), + id="dates", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype="object", + ), + "datetime64": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype="datetime64[us]", + ), + "arrow": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + 
dtype=pandas.ArrowDtype(pyarrow.timestamp("us")), + ), + } + ), + id="datetimes", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + shapely.geometry.Point(145.0, -37.8), + None, + shapely.geometry.Point(-122.3, 47.6), + ], + dtype="object", + ), + "geopandas": geopandas.GeoSeries( + [ + shapely.geometry.Point(145.0, -37.8), + None, + shapely.geometry.Point(-122.3, 47.6), + ] + ), + } + ), + id="geographys", + ), + # TODO(tswast): Add INTERVAL once BigFrames supports it. + pytest.param( + pandas.DataFrame( + { + # TODO(tswast): Is there an equivalent object type we can use here? + # TODO(tswast): Add built-in Arrow extension type + "db_dtypes": pandas.Series( + ["{}", None, "123"], + dtype=pandas.ArrowDtype(db_dtypes.JSONArrowType()), + ), + } + ), + id="jsons", + ), + pytest.param( + pandas.DataFrame( + { + "int64": pandas.Series([1, 2, 3], dtype="int64"), + "Int64": pandas.Series([1, None, 3], dtype="Int64"), + "object": pandas.Series([1, None, 3], dtype="object"), + "arrow": pandas.Series( + [1, None, 3], dtype=pandas.ArrowDtype(pyarrow.int64()) + ), + } + ), + id="ints", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [decimal.Decimal("1.23"), None, decimal.Decimal("4.56")], + dtype="object", + ), + "arrow": pandas.Series( + [decimal.Decimal("1.23"), None, decimal.Decimal("4.56")], + dtype=pandas.ArrowDtype(pyarrow.decimal128(38, 9)), + ), + } + ), + id="numerics", + ), + pytest.param( + pandas.DataFrame( + { + # TODO(tswast): Add object type for BIGNUMERIC. Can bigframes disambiguate? 
+ "arrow": pandas.Series( + [decimal.Decimal("1.23"), None, decimal.Decimal("4.56")], + dtype=pandas.ArrowDtype(pyarrow.decimal256(76, 38)), + ), + } + ), + id="bignumerics", + ), + pytest.param( + pandas.DataFrame( + { + "float64": pandas.Series([1.23, None, 4.56], dtype="float64"), + "Float64": pandas.Series([1.23, None, 4.56], dtype="Float64"), + "object": pandas.Series([1.23, None, 4.56], dtype="object"), + "arrow": pandas.Series( + [1.23, None, 4.56], dtype=pandas.ArrowDtype(pyarrow.float64()) + ), + } + ), + id="floats", + ), + # TODO(tswast): Add RANGE once BigFrames supports it. + pytest.param( + pandas.DataFrame( + { + "string": pandas.Series(["a", "b", "c"], dtype="string[python]"), + "object": pandas.Series(["a", None, "c"], dtype="object"), + "arrow": pandas.Series(["a", None, "c"], dtype="string[pyarrow]"), + } + ), + id="strings", + ), + pytest.param( + pandas.DataFrame( + { + # TODO(tswast): Add object type for STRUCT? How to tell apart from JSON? + "arrow": pandas.Series( + [{"a": 1, "b": 1.0, "c": "c"}], + dtype=pandas.ArrowDtype( + pyarrow.struct( + [ + ("a", pyarrow.int64()), + ("b", pyarrow.float64()), + ("c", pyarrow.string()), + ] + ) + ), + ), + } + ), + id="structs", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.time(0, 0, 0), + None, + datetime.time(13, 7, 11), + ], + dtype="object", + ), + "arrow": pandas.Series( + [ + datetime.time(0, 0, 0), + None, + datetime.time(13, 7, 11), + ], + dtype=pandas.ArrowDtype(pyarrow.time64("us")), + ), + } + ), + id="times", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.datetime( + 2023, 11, 23, 13, 14, 15, tzinfo=datetime.timezone.utc + ), + None, + datetime.datetime( + 1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + ], + dtype="object", + ), + "datetime64": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype="datetime64[us]", + 
).dt.tz_localize("UTC"), + "arrow": pandas.Series( + [ + datetime.datetime( + 2023, 11, 23, 13, 14, 15, tzinfo=datetime.timezone.utc + ), + None, + datetime.datetime( + 1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + ], + dtype=pandas.ArrowDtype(pyarrow.timestamp("us", "UTC")), + ), + } + ), + id="timestamps", + ), + ), +) +def test_read_gbq_colab_sessionless_dry_run_generates_valid_sql_for_local_dataframe( + df_pd: pandas.DataFrame, +): + # This method will fail with an exception if it receives invalid SQL. + result = module_under_test._run_read_gbq_colab_sessionless_dry_run( + query="SELECT * FROM {df_pd}", + pyformat_args={"df_pd": df_pd}, + ) + assert isinstance(result, pandas.Series) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index b037c6f371..e8d156538f 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3408,6 +3408,15 @@ def test__dir__with_rename(scalars_dfs): assert "drop" in results +def test_loc_select_columns_w_repeats(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index[["int64_col", "int64_col", "int64_too"]].to_pandas() + pd_result = scalars_pandas_df_index[["int64_col", "int64_col", "int64_too"]] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + @pytest.mark.parametrize( ("start", "stop", "step"), [ @@ -5529,7 +5538,7 @@ def test_astype_invalid_type_fail(scalars_dfs): bf_df.astype(123) -def test_agg_with_dict(scalars_dfs): +def test_agg_with_dict_lists(scalars_dfs): bf_df, pd_df = scalars_dfs agg_funcs = { "int64_too": ["min", "max"], @@ -5544,6 +5553,38 @@ def test_agg_with_dict(scalars_dfs): ) +def test_agg_with_dict_list_and_str(scalars_dfs): + bf_df, pd_df = scalars_dfs + agg_funcs = { + "int64_too": ["min", "max"], + "int64_col": "sum", + } + + bf_result = bf_df.agg(agg_funcs).to_pandas() + pd_result = pd_df.agg(agg_funcs) + + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, 
check_index_type=False + ) + + +def test_agg_with_dict_strs(scalars_dfs): + bf_df, pd_df = scalars_dfs + agg_funcs = { + "int64_too": "min", + "int64_col": "sum", + "float64_col": "max", + } + + bf_result = bf_df.agg(agg_funcs).to_pandas() + pd_result = pd_df.agg(agg_funcs) + pd_result.index = pd_result.index.astype("string[pyarrow]") + + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + def test_agg_with_dict_containing_non_existing_col_raise_key_error(scalars_dfs): bf_df, _ = scalars_dfs agg_funcs = { diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index 3b9854be26..c7e316a9d2 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -499,3 +499,29 @@ def test_index_item_with_empty(session): with pytest.raises(ValueError, match=re.escape(expected_message)): bf_idx_empty.item() + + +@pytest.mark.parametrize( + ("key", "value"), + [ + (0, "string_value"), + (1, 42), + ("label", None), + (-1, 3.14), + ], +) +def test_index_setitem_different_types(scalars_dfs, key, value): + """Tests that custom Index setitem raises TypeError.""" + scalars_df, _ = scalars_dfs + index = scalars_df.index + + with pytest.raises(TypeError, match="Index does not support mutable operations"): + index[key] = value + + +def test_custom_index_setitem_error(): + """Tests that custom Index setitem raises TypeError.""" + custom_index = bpd.Index([1, 2, 3, 4, 5], name="custom") + + with pytest.raises(TypeError, match="Index does not support mutable operations"): + custom_index[2] = 999 diff --git a/tests/system/small/test_polars_execution.py b/tests/system/small/test_polars_execution.py new file mode 100644 index 0000000000..0aed693b80 --- /dev/null +++ b/tests/system/small/test_polars_execution.py @@ -0,0 +1,76 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest + +import bigframes +from bigframes.testing.utils import assert_pandas_df_equal + +polars = pytest.importorskip("polars", reason="polars is required for this test") + + +@pytest.fixture(scope="module") +def session_w_polars(): + context = bigframes.BigQueryOptions(location="US", enable_polars_execution=True) + session = bigframes.Session(context=context) + yield session + session.close() # close generated session at cleanup time + + +def test_polar_execution_sorted(session_w_polars, scalars_pandas_df_index): + execution_count_before = session_w_polars._metrics.execution_count + bf_df = session_w_polars.read_pandas(scalars_pandas_df_index) + + pd_result = scalars_pandas_df_index.sort_index(ascending=False)[ + ["int64_too", "bool_col"] + ] + bf_result = bf_df.sort_index(ascending=False)[["int64_too", "bool_col"]].to_pandas() + + assert session_w_polars._metrics.execution_count == execution_count_before + assert_pandas_df_equal(bf_result, pd_result) + + +def test_polar_execution_sorted_filtered(session_w_polars, scalars_pandas_df_index): + execution_count_before = session_w_polars._metrics.execution_count + bf_df = session_w_polars.read_pandas(scalars_pandas_df_index) + + pd_result = scalars_pandas_df_index.sort_index(ascending=False).dropna( + subset=["int64_col", "string_col"] + ) + bf_result = ( + bf_df.sort_index(ascending=False) + .dropna(subset=["int64_col", "string_col"]) + .to_pandas() + ) + + # Filter and isnull not supported by polars engine yet, so falls back to bq execution + assert session_w_polars._metrics.execution_count == 
(execution_count_before + 1) + assert_pandas_df_equal(bf_result, pd_result) + + +def test_polar_execution_unsupported_sql_fallback( + session_w_polars, scalars_pandas_df_index +): + execution_count_before = session_w_polars._metrics.execution_count + bf_df = session_w_polars.read_pandas(scalars_pandas_df_index) + + pd_df = scalars_pandas_df_index.copy() + pd_df["str_len_col"] = pd_df.string_col.str.len() + pd_result = pd_df + + bf_df["str_len_col"] = bf_df.string_col.str.len() + bf_result = bf_df.to_pandas() + + # str len not supported by polars engine yet, so falls back to bq execution + assert session_w_polars._metrics.execution_count == (execution_count_before + 1) + assert_pandas_df_equal(bf_result, pd_result) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 6760d63a20..d513b0e780 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -510,6 +510,69 @@ def test_series___getitem___with_default_index(scalars_dfs): assert bf_result == pd_result +@pytest.mark.parametrize( + ("index_col", "key", "value"), + ( + ("int64_too", 2, "new_string_value"), + ("string_col", "Hello, World!", "updated_value"), + ("int64_too", 0, None), + ), +) +def test_series___setitem__(scalars_dfs, index_col, key, value): + col_name = "string_col" + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.set_index(index_col, drop=False) + scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) + + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name].copy() + + bf_series[key] = value + pd_series[key] = value + + pd.testing.assert_series_equal(bf_series.to_pandas(), pd_series) + + +@pytest.mark.parametrize( + ("key", "value"), + ( + (0, 999), + (1, 888), + (0, None), + (-2345, 777), + ), +) +def test_series___setitem___with_int_key_numeric(scalars_dfs, key, value): + col_name = "int64_col" + index_col = "int64_too" + scalars_df, scalars_pandas_df = scalars_dfs + 
scalars_df = scalars_df.set_index(index_col, drop=False) + scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) + + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name].copy() + + bf_series[key] = value + pd_series[key] = value + + pd.testing.assert_series_equal(bf_series.to_pandas(), pd_series) + + +def test_series___setitem___with_default_index(scalars_dfs): + col_name = "float64_col" + key = 2 + value = 123.456 + scalars_df, scalars_pandas_df = scalars_dfs + + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name].copy() + + bf_series[key] = value + pd_series[key] = value + + assert bf_series.to_pandas().iloc[key] == pd_series.iloc[key] + + @pytest.mark.parametrize( ("col_name",), ( diff --git a/tests/unit/core/compile/sqlglot/conftest.py b/tests/unit/core/compile/sqlglot/conftest.py index 20f8159f01..645daddd46 100644 --- a/tests/unit/core/compile/sqlglot/conftest.py +++ b/tests/unit/core/compile/sqlglot/conftest.py @@ -29,18 +29,16 @@ DATA_DIR = CURRENT_DIR.parent.parent.parent.parent / "data" -@pytest.fixture(scope="session") -def compiler_session(scalars_types_table_schema): +def _create_compiler_session(table_name, table_schema): + """Helper function to create a compiler session.""" from bigframes.testing import compiler_session - # TODO: Check if ordering mode is needed for the tests. 
- table_name = "scalar_types" anonymous_dataset = bigquery.DatasetReference.from_string( "bigframes-dev.sqlglot_test" ) session = mocks.create_bigquery_session( table_name=table_name, - table_schema=scalars_types_table_schema, + table_schema=table_schema, anonymous_dataset=anonymous_dataset, ) session._executor = compiler_session.SQLCompilerExecutor() @@ -48,7 +46,33 @@ def compiler_session(scalars_types_table_schema): @pytest.fixture(scope="session") -def scalars_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: +def compiler_session(scalar_types_table_schema): + """Compiler session for scalar types.""" + return _create_compiler_session("scalar_types", scalar_types_table_schema) + + +@pytest.fixture(scope="session") +def compiler_session_w_repeated_types(repeated_types_table_schema): + """Compiler session for repeated data types.""" + return _create_compiler_session("repeated_types", repeated_types_table_schema) + + +@pytest.fixture(scope="session") +def compiler_session_w_nested_structs_types(nested_structs_types_table_schema): + """Compiler session for nested STRUCT data types.""" + return _create_compiler_session( + "nested_structs_types", nested_structs_types_table_schema + ) + + +@pytest.fixture(scope="session") +def compiler_session_w_json_types(json_types_table_schema): + """Compiler session for JSON data types.""" + return _create_compiler_session("json_types", json_types_table_schema) + + +@pytest.fixture(scope="session") +def scalar_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: return [ bigquery.SchemaField("bool_col", "BOOLEAN"), bigquery.SchemaField("bytes_col", "BYTES"), @@ -68,7 +92,7 @@ def scalars_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: @pytest.fixture(scope="session") -def scalars_types_df(compiler_session) -> bpd.DataFrame: +def scalar_types_df(compiler_session) -> bpd.DataFrame: """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex` column as the index.""" bf_df 
= compiler_session.read_gbq_table("bigframes-dev.sqlglot_test.scalar_types") @@ -77,7 +101,7 @@ def scalars_types_df(compiler_session) -> bpd.DataFrame: @pytest.fixture(scope="session") -def scalars_types_pandas_df() -> pd.DataFrame: +def scalar_types_pandas_df() -> pd.DataFrame: """Returns a pandas DataFrame containing all scalar types and using the `rowindex` column as the index.""" # TODO: add tests for empty dataframes @@ -91,6 +115,40 @@ def scalars_types_pandas_df() -> pd.DataFrame: return df +@pytest.fixture(scope="session") +def nested_structs_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("id", "INTEGER"), + bigquery.SchemaField( + "people", + "RECORD", + fields=[ + bigquery.SchemaField("name", "STRING"), + bigquery.SchemaField("age", "INTEGER"), + bigquery.SchemaField( + "address", + "RECORD", + fields=[ + bigquery.SchemaField("city", "STRING"), + bigquery.SchemaField("country", "STRING"), + ], + ), + ], + ), + ] + + +@pytest.fixture(scope="session") +def nested_structs_types_df(compiler_session_w_nested_structs_types) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing nested STRUCT types and using the `id` + column as the index.""" + bf_df = compiler_session_w_nested_structs_types.read_gbq_table( + "bigframes-dev.sqlglot_test.nested_structs_types" + ) + bf_df = bf_df.set_index("id", drop=False) + return bf_df + + @pytest.fixture(scope="session") def nested_structs_pandas_df() -> pd.DataFrame: """Returns a pandas DataFrame containing STRUCT types and using the `id` @@ -117,7 +175,32 @@ def nested_structs_pandas_df() -> pd.DataFrame: @pytest.fixture(scope="session") -def repeated_pandas_df() -> pd.DataFrame: +def repeated_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("int_list_col", "INTEGER", "REPEATED"), + bigquery.SchemaField("bool_list_col", "BOOLEAN", "REPEATED"), + 
bigquery.SchemaField("float_list_col", "FLOAT", "REPEATED"), + bigquery.SchemaField("date_list_col", "DATE", "REPEATED"), + bigquery.SchemaField("date_time_list_col", "DATETIME", "REPEATED"), + bigquery.SchemaField("numeric_list_col", "NUMERIC", "REPEATED"), + bigquery.SchemaField("string_list_col", "STRING", "REPEATED"), + ] + + +@pytest.fixture(scope="session") +def repeated_types_df(compiler_session_w_repeated_types) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing LIST types and using the `rowindex` + column as the index.""" + bf_df = compiler_session_w_repeated_types.read_gbq_table( + "bigframes-dev.sqlglot_test.repeated_types" + ) + bf_df = bf_df.set_index("rowindex", drop=False) + return bf_df + + +@pytest.fixture(scope="session") +def repeated_types_pandas_df() -> pd.DataFrame: """Returns a pandas DataFrame containing LIST types and using the `rowindex` column as the index.""" @@ -125,10 +208,31 @@ def repeated_pandas_df() -> pd.DataFrame: DATA_DIR / "repeated.jsonl", lines=True, ) + # TODO: add dtype conversion here if needed. df = df.set_index("rowindex") return df +@pytest.fixture(scope="session") +def json_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("json_col", "JSON"), + ] + + +@pytest.fixture(scope="session") +def json_types_df(compiler_session_w_json_types) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing JSON types and using the `rowindex` + column as the index.""" + bf_df = compiler_session_w_json_types.read_gbq_table( + "bigframes-dev.sqlglot_test.json_types" + ) + # TODO(b/427305807): Why `drop=False` will produce two "rowindex" columns? 
+ bf_df = bf_df.set_index("rowindex", drop=True) + return bf_df + + @pytest.fixture(scope="session") def json_pandas_df() -> pd.DataFrame: """Returns a pandas DataFrame containing JSON types and using the `rowindex` @@ -149,8 +253,10 @@ def json_pandas_df() -> pd.DataFrame: ] df = pd.DataFrame( { + "rowindex": pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE), "json_col": pd.Series(json_data, dtype=dtypes.JSON_DTYPE), }, - index=pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE), ) + # TODO(b/427305807): Why `drop=False` will produce two "rowindex" columns? + df = df.set_index("rowindex", drop=True) return df diff --git a/tests/unit/core/compile/sqlglot/expressions/__init__.py b/tests/unit/core/compile/sqlglot/expressions/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric/out.sql diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add_w_scalar/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric_w_scalar/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add_w_scalar/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric_w_scalar/out.sql diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_string_add/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_string/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_string_add/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_string/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_index/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_index/out.sql new file mode 100644 index 0000000000..33a8bded13 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_index/out.sql @@ -0,0 +1,15 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `string_list_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` +), `bfcte_1` AS ( + SELECT + *, + 
`bfcol_1`[SAFE_OFFSET(1)] AS `bfcol_4` + FROM `bfcte_0` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_4` AS `string_list_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_only_start/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_only_start/out.sql new file mode 100644 index 0000000000..34d2225931 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_only_start/out.sql @@ -0,0 +1,21 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `string_list_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` +), `bfcte_1` AS ( + SELECT + *, + ARRAY( + SELECT + el + FROM UNNEST(`bfcol_1`) AS el WITH OFFSET AS slice_idx + WHERE + slice_idx >= 1 + ) AS `bfcol_4` + FROM `bfcte_0` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_4` AS `string_list_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_start_and_stop/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_start_and_stop/out.sql new file mode 100644 index 0000000000..d46803ce7c --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_start_and_stop/out.sql @@ -0,0 +1,21 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `string_list_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` +), `bfcte_1` AS ( + SELECT + *, + ARRAY( + SELECT + el + FROM UNNEST(`bfcol_1`) AS el WITH OFFSET AS slice_idx + WHERE + slice_idx >= 1 AND slice_idx < 5 + ) AS `bfcol_4` + FROM `bfcte_0` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_4` AS `string_list_col` +FROM `bfcte_1` \ No newline at end of file diff --git 
a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_to_string/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_to_string/out.sql new file mode 100644 index 0000000000..e0db21f972 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_to_string/out.sql @@ -0,0 +1,15 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `string_list_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` +), `bfcte_1` AS ( + SELECT + *, + ARRAY_TO_STRING(`bfcol_1`, '.') AS `bfcol_4` + FROM `bfcte_0` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_4` AS `string_list_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_compile_numerical_add_w_scalar/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_compile_numerical_add_w_scalar/out.sql new file mode 100644 index 0000000000..9c4b01a6df --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_compile_numerical_add_w_scalar/out.sql @@ -0,0 +1,16 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0`, + `rowindex` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_1` AS `bfcol_4`, + `bfcol_0` + 1 AS `bfcol_5` + FROM `bfcte_0` +) +SELECT + `bfcol_4` AS `rowindex`, + `bfcol_5` AS `int64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_compile_string_add/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_compile_string_add/out.sql new file mode 100644 index 0000000000..7a8ab83df1 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_compile_string_add/out.sql @@ -0,0 +1,16 @@ +WITH `bfcte_0` 
AS ( + SELECT + `rowindex` AS `bfcol_0`, + `string_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_0` AS `bfcol_4`, + CONCAT(`bfcol_1`, 'a') AS `bfcol_5` + FROM `bfcte_0` +) +SELECT + `bfcol_4` AS `rowindex`, + `bfcol_5` AS `string_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py b/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py similarity index 72% rename from tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py rename to tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py index 862ee2467c..f3c96e9253 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py @@ -19,19 +19,25 @@ pytest.importorskip("pytest_snapshot") -def test_compile_numerical_add(scalars_types_df: bpd.DataFrame, snapshot): - bf_df = scalars_types_df[["int64_col"]] +def test_add_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col"]] + bf_df["int64_col"] = bf_df["int64_col"] + bf_df["int64_col"] + snapshot.assert_match(bf_df.sql, "out.sql") -def test_compile_numerical_add_w_scalar(scalars_types_df: bpd.DataFrame, snapshot): - bf_df = scalars_types_df[["int64_col"]] +def test_add_numeric_w_scalar(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col"]] + bf_df["int64_col"] = bf_df["int64_col"] + 1 + snapshot.assert_match(bf_df.sql, "out.sql") -def test_compile_string_add(scalars_types_df: bpd.DataFrame, snapshot): - bf_df = scalars_types_df[["string_col"]] +def test_add_string(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["string_col"]] + bf_df["string_col"] = bf_df["string_col"] + "a" + snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_op_registration.py 
b/tests/unit/core/compile/sqlglot/expressions/test_op_registration.py new file mode 100644 index 0000000000..1c49dde6ca --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/test_op_registration.py @@ -0,0 +1,43 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from sqlglot import expressions as sge + +from bigframes.core.compile.sqlglot.expressions import op_registration +from bigframes.operations import numeric_ops + + +def test_register_then_get(): + reg = op_registration.OpRegistration() + input = sge.to_identifier("A") + op = numeric_ops.add_op + + @reg.register(numeric_ops.AddOp) + def test_func(op: numeric_ops.AddOp, input: sge.Expression) -> sge.Expression: + return input + + assert reg[numeric_ops.add_op](op, input) == test_func(op, input) + assert reg[numeric_ops.add_op.name](op, input) == test_func(op, input) + + +def test_register_function_first_argument_is_not_scalar_op_raise_error(): + reg = op_registration.OpRegistration() + + @reg.register(numeric_ops.AddOp) + def test_func(input: sge.Expression) -> sge.Expression: + return input + + with pytest.raises(ValueError, match=r".*first parameter must be an operator.*"): + test_func(sge.to_identifier("A")) diff --git a/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py new file mode 100644 index 0000000000..317c2f891b --- /dev/null +++ 
b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py @@ -0,0 +1,44 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from bigframes import bigquery +import bigframes.pandas as bpd + +pytest.importorskip("pytest_snapshot") + + +def test_array_to_string(repeated_types_df: bpd.DataFrame, snapshot): + result = bigquery.array_to_string(repeated_types_df["string_list_col"], ".") + + snapshot.assert_match(result.to_frame().sql, "out.sql") + + +def test_array_index(repeated_types_df: bpd.DataFrame, snapshot): + result = repeated_types_df["string_list_col"].list[1] + + snapshot.assert_match(result.to_frame().sql, "out.sql") + + +def test_array_slice_with_only_start(repeated_types_df: bpd.DataFrame, snapshot): + result = repeated_types_df["string_list_col"].list[1:] + + snapshot.assert_match(result.to_frame().sql, "out.sql") + + +def test_array_slice_with_start_and_stop(repeated_types_df: bpd.DataFrame, snapshot): + result = repeated_types_df["string_list_col"].list[1:5] + + snapshot.assert_match(result.to_frame().sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql index 8da545b8fa..62e22a6a19 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql +++ 
b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql @@ -49,21 +49,21 @@ WITH `bfcte_1` AS ( * FROM ( SELECT - bfcol_17 AS `bfcol_46`, - bfcol_18 AS `bfcol_47`, - bfcol_19 AS `bfcol_48`, - bfcol_20 AS `bfcol_49`, - bfcol_21 AS `bfcol_50`, - bfcol_22 AS `bfcol_51` + `bfcol_17` AS `bfcol_46`, + `bfcol_18` AS `bfcol_47`, + `bfcol_19` AS `bfcol_48`, + `bfcol_20` AS `bfcol_49`, + `bfcol_21` AS `bfcol_50`, + `bfcol_22` AS `bfcol_51` FROM `bfcte_6` UNION ALL SELECT - bfcol_40 AS `bfcol_46`, - bfcol_41 AS `bfcol_47`, - bfcol_42 AS `bfcol_48`, - bfcol_43 AS `bfcol_49`, - bfcol_44 AS `bfcol_50`, - bfcol_45 AS `bfcol_51` + `bfcol_40` AS `bfcol_46`, + `bfcol_41` AS `bfcol_47`, + `bfcol_42` AS `bfcol_48`, + `bfcol_43` AS `bfcol_49`, + `bfcol_44` AS `bfcol_50`, + `bfcol_45` AS `bfcol_51` FROM `bfcte_7` ) ) diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_dataframe/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_dataframe/out.sql new file mode 100644 index 0000000000..679da58f44 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_dataframe/out.sql @@ -0,0 +1,21 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `int_list_col` AS `bfcol_1`, + `string_list_col` AS `bfcol_2` + FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` +), `bfcte_1` AS ( + SELECT + * + REPLACE (`bfcol_1`[SAFE_OFFSET(`bfcol_13`)] AS `bfcol_1`, `bfcol_2`[SAFE_OFFSET(`bfcol_13`)] AS `bfcol_2`) + FROM `bfcte_0` + CROSS JOIN UNNEST(GENERATE_ARRAY(0, LEAST(ARRAY_LENGTH(`bfcol_1`) - 1, ARRAY_LENGTH(`bfcol_2`) - 1))) AS `bfcol_13` WITH OFFSET AS `bfcol_7` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_0` AS `rowindex_1`, + `bfcol_1` AS `int_list_col`, + `bfcol_2` AS `string_list_col` +FROM `bfcte_1` +ORDER BY + `bfcol_7` ASC NULLS LAST \ No newline at end of file diff --git 
a/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_series/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_series/out.sql new file mode 100644 index 0000000000..8bfd1eb005 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_series/out.sql @@ -0,0 +1,18 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `int_list_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` +), `bfcte_1` AS ( + SELECT + * + REPLACE (`bfcol_8` AS `bfcol_1`) + FROM `bfcte_0` + CROSS JOIN UNNEST(`bfcol_1`) AS `bfcol_8` WITH OFFSET AS `bfcol_4` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `int_list_col` +FROM `bfcte_1` +ORDER BY + `bfcol_4` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_filter/test_compile_filter/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_filter/test_compile_filter/out.sql new file mode 100644 index 0000000000..9ca7fb6a74 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_filter/test_compile_filter/out.sql @@ -0,0 +1,25 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0`, + `rowindex` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_1` AS `bfcol_5`, + `bfcol_1` AS `bfcol_6`, + `bfcol_0` AS `bfcol_7`, + `bfcol_1` >= 1 AS `bfcol_8` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + * + FROM `bfcte_1` + WHERE + `bfcol_8` +) +SELECT + `bfcol_5` AS `rowindex`, + `bfcol_6` AS `rowindex_1`, + `bfcol_7` AS `int64_col` +FROM `bfcte_2` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql index 100036d75f..4e21266b87 100644 --- 
a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql @@ -1,10 +1,11 @@ WITH `bfcte_0` AS ( SELECT * - FROM UNNEST(ARRAY>[STRUCT(PARSE_JSON('null'), 0), STRUCT(PARSE_JSON('true'), 1), STRUCT(PARSE_JSON('100'), 2), STRUCT(PARSE_JSON('0.98'), 3), STRUCT(PARSE_JSON('"a string"'), 4), STRUCT(PARSE_JSON('[]'), 5), STRUCT(PARSE_JSON('[1,2,3]'), 6), STRUCT(PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(PARSE_JSON('"100"'), 8), STRUCT(PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(PARSE_JSON('{"list_data":[10,20,30]}'), 11)]) + FROM UNNEST(ARRAY>[STRUCT(0, PARSE_JSON('null'), 0), STRUCT(1, PARSE_JSON('true'), 1), STRUCT(2, PARSE_JSON('100'), 2), STRUCT(3, PARSE_JSON('0.98'), 3), STRUCT(4, PARSE_JSON('"a string"'), 4), STRUCT(5, PARSE_JSON('[]'), 5), STRUCT(6, PARSE_JSON('[1,2,3]'), 6), STRUCT(7, PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(8, PARSE_JSON('"100"'), 8), STRUCT(9, PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(10, PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(11, PARSE_JSON('{"list_data":[10,20,30]}'), 11)]) ) SELECT - `bfcol_0` AS `json_col` + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `json_col` FROM `bfcte_0` ORDER BY - `bfcol_1` ASC NULLS LAST \ No newline at end of file + `bfcol_2` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_json_types/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_json_types/out.sql new file mode 100644 index 0000000000..4e8f61d75d --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_json_types/out.sql @@ -0,0 +1,10 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS 
`bfcol_0`, + `json_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`json_types` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `json_col` +FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_nested_structs_types/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_nested_structs_types/out.sql new file mode 100644 index 0000000000..75c4a86e18 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_nested_structs_types/out.sql @@ -0,0 +1,11 @@ +WITH `bfcte_0` AS ( + SELECT + `id` AS `bfcol_0`, + `people` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`nested_structs_types` +) +SELECT + `bfcol_0` AS `id`, + `bfcol_0` AS `id_1`, + `bfcol_1` AS `people` +FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_repeated_types/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_repeated_types/out.sql new file mode 100644 index 0000000000..2436c01a44 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_repeated_types/out.sql @@ -0,0 +1,23 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `int_list_col` AS `bfcol_1`, + `bool_list_col` AS `bfcol_2`, + `float_list_col` AS `bfcol_3`, + `date_list_col` AS `bfcol_4`, + `date_time_list_col` AS `bfcol_5`, + `numeric_list_col` AS `bfcol_6`, + `string_list_col` AS `bfcol_7` + FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_0` AS `rowindex_1`, + `bfcol_1` AS `int_list_col`, + `bfcol_2` AS `bool_list_col`, + `bfcol_3` AS `float_list_col`, + `bfcol_4` AS `date_list_col`, + `bfcol_5` AS `date_time_list_col`, + `bfcol_6` AS `numeric_list_col`, + `bfcol_7` AS `string_list_col` 
+FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_concat.py b/tests/unit/core/compile/sqlglot/test_compile_concat.py index ec7e83a4b0..79f73d3113 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_concat.py +++ b/tests/unit/core/compile/sqlglot/test_compile_concat.py @@ -22,11 +22,11 @@ def test_compile_concat( - scalars_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot + scalar_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot ): # TODO: concat two same dataframes, which SQL does not get reused. # TODO: concat dataframes from a gbq table but trigger a windows compiler. - df1 = bpd.DataFrame(scalars_types_pandas_df, session=compiler_session) + df1 = bpd.DataFrame(scalar_types_pandas_df, session=compiler_session) df1 = df1[["rowindex", "int64_col", "string_col"]] concat_df = bpd.concat([df1, df1]) snapshot.assert_match(concat_df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/test_compile_explode.py b/tests/unit/core/compile/sqlglot/test_compile_explode.py new file mode 100644 index 0000000000..34adbbd23a --- /dev/null +++ b/tests/unit/core/compile/sqlglot/test_compile_explode.py @@ -0,0 +1,31 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import bigframes.pandas as bpd + +pytest.importorskip("pytest_snapshot") + + +# TODO: check order by with offset +def test_compile_explode_series(repeated_types_df: bpd.DataFrame, snapshot): + s = repeated_types_df["int_list_col"].explode() + snapshot.assert_match(s.to_frame().sql, "out.sql") + + +def test_compile_explode_dataframe(repeated_types_df: bpd.DataFrame, snapshot): + exploded_columns = ["int_list_col", "string_list_col"] + df = repeated_types_df[["rowindex", *exploded_columns]].explode(exploded_columns) + snapshot.assert_match(df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/test_compile_filter.py b/tests/unit/core/compile/sqlglot/test_compile_filter.py new file mode 100644 index 0000000000..0afb5eb45b --- /dev/null +++ b/tests/unit/core/compile/sqlglot/test_compile_filter.py @@ -0,0 +1,25 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import bigframes.pandas as bpd + +pytest.importorskip("pytest_snapshot") + + +def test_compile_filter(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["rowindex", "int64_col"]] + bf_filter = bf_df[bf_df["rowindex"] >= 1] + snapshot.assert_match(bf_filter.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py index 58587da129..7307fd9b4e 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py +++ b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py @@ -22,34 +22,39 @@ def test_compile_readlocal( - scalars_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot + scalar_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot ): - bf_df = bpd.DataFrame(scalars_types_pandas_df, session=compiler_session) + bf_df = bpd.DataFrame(scalar_types_pandas_df, session=compiler_session) snapshot.assert_match(bf_df.sql, "out.sql") def test_compile_readlocal_w_structs_df( nested_structs_pandas_df: pd.DataFrame, - compiler_session: bigframes.Session, + compiler_session_w_nested_structs_types: bigframes.Session, snapshot, ): - bf_df = bpd.DataFrame(nested_structs_pandas_df, session=compiler_session) + # TODO(b/427306734): Check why the output is different from the expected output. 
+ bf_df = bpd.DataFrame( + nested_structs_pandas_df, session=compiler_session_w_nested_structs_types + ) snapshot.assert_match(bf_df.sql, "out.sql") def test_compile_readlocal_w_lists_df( - repeated_pandas_df: pd.DataFrame, - compiler_session: bigframes.Session, + repeated_types_pandas_df: pd.DataFrame, + compiler_session_w_repeated_types: bigframes.Session, snapshot, ): - bf_df = bpd.DataFrame(repeated_pandas_df, session=compiler_session) + bf_df = bpd.DataFrame( + repeated_types_pandas_df, session=compiler_session_w_repeated_types + ) snapshot.assert_match(bf_df.sql, "out.sql") def test_compile_readlocal_w_json_df( json_pandas_df: pd.DataFrame, - compiler_session: bigframes.Session, + compiler_session_w_json_types: bigframes.Session, snapshot, ): - bf_df = bpd.DataFrame(json_pandas_df, session=compiler_session) + bf_df = bpd.DataFrame(json_pandas_df, session=compiler_session_w_json_types) snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/test_compile_readtable.py b/tests/unit/core/compile/sqlglot/test_compile_readtable.py index 63849f093c..a5692e5fbf 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_readtable.py +++ b/tests/unit/core/compile/sqlglot/test_compile_readtable.py @@ -19,17 +19,31 @@ pytest.importorskip("pytest_snapshot") -def test_compile_readtable(scalars_types_df: bpd.DataFrame, snapshot): - snapshot.assert_match(scalars_types_df.sql, "out.sql") +def test_compile_readtable(scalar_types_df: bpd.DataFrame, snapshot): + snapshot.assert_match(scalar_types_df.sql, "out.sql") -def test_compile_readtable_w_ordering(scalars_types_df: bpd.DataFrame, snapshot): - bf_df = scalars_types_df[["int64_col"]] +def test_compile_readtable_w_repeated_types(repeated_types_df: bpd.DataFrame, snapshot): + snapshot.assert_match(repeated_types_df.sql, "out.sql") + + +def test_compile_readtable_w_nested_structs_types( + nested_structs_types_df: bpd.DataFrame, snapshot +): + snapshot.assert_match(nested_structs_types_df.sql, 
"out.sql") + + +def test_compile_readtable_w_json_types(json_types_df: bpd.DataFrame, snapshot): + snapshot.assert_match(json_types_df.sql, "out.sql") + + +def test_compile_readtable_w_ordering(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col"]] bf_df = bf_df.sort_values("int64_col") snapshot.assert_match(bf_df.sql, "out.sql") -def test_compile_readtable_w_limit(scalars_types_df: bpd.DataFrame, snapshot): - bf_df = scalars_types_df[["int64_col"]] +def test_compile_readtable_w_limit(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col"]] bf_df = bf_df.sort_index().head(10) snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/core/test_pyformat.py b/tests/unit/core/test_pyformat.py index 05110d8485..447ce37766 100644 --- a/tests/unit/core/test_pyformat.py +++ b/tests/unit/core/test_pyformat.py @@ -19,13 +19,19 @@ from __future__ import annotations +import datetime import decimal from typing import Any, Dict, List +import db_dtypes # type: ignore +import geopandas # type: ignore import google.cloud.bigquery import google.cloud.bigquery.table +import numpy import pandas +import pyarrow import pytest +import shapely.geometry # type: ignore from bigframes.core import pyformat from bigframes.testing import mocks @@ -91,42 +97,313 @@ def test_pyformat_with_no_variables(session): pytest.param( # Empty columns default to floating point, just like pandas. pandas.DataFrame({"empty column": []}), - "STRUCT<`empty column` FLOAT>", + "STRUCT<`empty column` FLOAT64>", id="empty column", ), + # Regression tests for b/428190014. + # + # Test every BigQuery type we support, especially those where the legacy + # SQL type name differs from the GoogleSQL type name. 
+ # + # See: + # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types + # and compare to the legacy types at + # https://cloud.google.com/bigquery/docs/data-types + # + # Test these against the real BigQuery dry run API in + # tests/system/small/pandas/io/api/test_read_gbq_colab.py pytest.param( pandas.DataFrame( { - "col1": [1, 2, 3], - "col2": ["a", "b", "c"], - "col3": [ - decimal.Decimal(1), - decimal.Decimal(2), - decimal.Decimal(3), - ], + "ints": pandas.Series( + [[1], [2], [3]], + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.int64())), + ), + "floats": pandas.Series( + [[1.0], [2.0], [3.0]], + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.float64())), + ), } ), - "STRUCT<`col1` INTEGER, `col2` STRING, `col3` NUMERIC>", - id="scalars", + "STRUCT<`ints` ARRAY, `floats` ARRAY>", + id="arrays", ), pytest.param( pandas.DataFrame( - {"array col": [[1, 2, 3]], "another array": [["a", "b", "c"]]} + { + "bool": pandas.Series([True, False, True], dtype="bool"), + "boolean": pandas.Series([True, None, True], dtype="boolean"), + "object": pandas.Series([True, None, True], dtype="object"), + "arrow": pandas.Series( + [True, None, True], dtype=pandas.ArrowDtype(pyarrow.bool_()) + ), + } ), - "STRUCT<`array col` ARRAY, `another array` ARRAY>", - id="arrays", + "STRUCT<`bool` BOOL, `boolean` BOOL, `object` BOOL, `arrow` BOOL>", + id="bools", ), pytest.param( pandas.DataFrame( { - "struct col": [ - {"subfield": {"subsubfield": 1}, "subfield2": 2}, - ], + "bytes": pandas.Series([b"a", b"b", b"c"], dtype=numpy.bytes_), + "object": pandas.Series([b"a", None, b"c"], dtype="object"), + "arrow": pandas.Series( + [b"a", None, b"c"], dtype=pandas.ArrowDtype(pyarrow.binary()) + ), + } + ), + "STRUCT<`bytes` BYTES, `object` BYTES, `arrow` BYTES>", + id="bytes", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.date(2023, 11, 23), + None, + datetime.date(1970, 1, 1), + ], + dtype="object", + ), + "arrow": pandas.Series( + [ + 
datetime.date(2023, 11, 23), + None, + datetime.date(1970, 1, 1), + ], + dtype=pandas.ArrowDtype(pyarrow.date32()), + ), + } + ), + "STRUCT<`object` DATE, `arrow` DATE>", + id="dates", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype="object", + ), + "datetime64": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype="datetime64[us]", + ), + "arrow": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype=pandas.ArrowDtype(pyarrow.timestamp("us")), + ), + } + ), + "STRUCT<`object` DATETIME, `datetime64` DATETIME, `arrow` DATETIME>", + id="datetimes", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + shapely.geometry.Point(145.0, -37.8), + None, + shapely.geometry.Point(-122.3, 47.6), + ], + dtype="object", + ), + "geopandas": geopandas.GeoSeries( + [ + shapely.geometry.Point(145.0, -37.8), + None, + shapely.geometry.Point(-122.3, 47.6), + ] + ), + } + ), + "STRUCT<`object` GEOGRAPHY, `geopandas` GEOGRAPHY>", + id="geographys", + ), + # TODO(tswast): Add INTERVAL once BigFrames supports it. + pytest.param( + pandas.DataFrame( + { + # TODO(tswast): Is there an equivalent object type we can use here? 
+ # TODO(tswast): Add built-in Arrow extension type + "db_dtypes": pandas.Series( + ["{}", None, "123"], + dtype=pandas.ArrowDtype(db_dtypes.JSONArrowType()), + ), + } + ), + "STRUCT<`db_dtypes` JSON>", + id="jsons", + ), + pytest.param( + pandas.DataFrame( + { + "int64": pandas.Series([1, 2, 3], dtype="int64"), + "Int64": pandas.Series([1, None, 3], dtype="Int64"), + "object": pandas.Series([1, None, 3], dtype="object"), + "arrow": pandas.Series( + [1, None, 3], dtype=pandas.ArrowDtype(pyarrow.int64()) + ), + } + ), + "STRUCT<`int64` INT64, `Int64` INT64, `object` INT64, `arrow` INT64>", + id="ints", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [decimal.Decimal("1.23"), None, decimal.Decimal("4.56")], + dtype="object", + ), + "arrow": pandas.Series( + [decimal.Decimal("1.23"), None, decimal.Decimal("4.56")], + dtype=pandas.ArrowDtype(pyarrow.decimal128(38, 9)), + ), + } + ), + "STRUCT<`object` NUMERIC, `arrow` NUMERIC>", + id="numerics", + ), + pytest.param( + pandas.DataFrame( + { + # TODO(tswast): Add object type for BIGNUMERIC. Can bigframes disambiguate? + "arrow": pandas.Series( + [decimal.Decimal("1.23"), None, decimal.Decimal("4.56")], + dtype=pandas.ArrowDtype(pyarrow.decimal256(76, 38)), + ), + } + ), + "STRUCT<`arrow` BIGNUMERIC>", + id="bignumerics", + ), + pytest.param( + pandas.DataFrame( + { + "float64": pandas.Series([1.23, None, 4.56], dtype="float64"), + "Float64": pandas.Series([1.23, None, 4.56], dtype="Float64"), + "object": pandas.Series([1.23, None, 4.56], dtype="object"), + "arrow": pandas.Series( + [1.23, None, 4.56], dtype=pandas.ArrowDtype(pyarrow.float64()) + ), } ), - "STRUCT<`struct col` STRUCT<`subfield` STRUCT<`subsubfield` INTEGER>, `subfield2` INTEGER>>", + "STRUCT<`float64` FLOAT64, `Float64` FLOAT64, `object` FLOAT64, `arrow` FLOAT64>", + id="floats", + ), + # TODO(tswast): Add RANGE once BigFrames supports it. 
+ pytest.param( + pandas.DataFrame( + { + "string": pandas.Series(["a", "b", "c"], dtype="string[python]"), + "object": pandas.Series(["a", None, "c"], dtype="object"), + "arrow": pandas.Series(["a", None, "c"], dtype="string[pyarrow]"), + } + ), + "STRUCT<`string` STRING, `object` STRING, `arrow` STRING>", + id="strings", + ), + pytest.param( + pandas.DataFrame( + { + # TODO(tswast): Add object type for STRUCT? How to tell apart from JSON? + "arrow": pandas.Series( + [{"a": 1, "b": 1.0, "c": "c"}], + dtype=pandas.ArrowDtype( + pyarrow.struct( + [ + ("a", pyarrow.int64()), + ("b", pyarrow.float64()), + ("c", pyarrow.string()), + ] + ) + ), + ), + } + ), + "STRUCT<`arrow` STRUCT<`a` INT64, `b` FLOAT64, `c` STRING>>", id="structs", ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.time(0, 0, 0), + None, + datetime.time(13, 7, 11), + ], + dtype="object", + ), + "arrow": pandas.Series( + [ + datetime.time(0, 0, 0), + None, + datetime.time(13, 7, 11), + ], + dtype=pandas.ArrowDtype(pyarrow.time64("us")), + ), + } + ), + "STRUCT<`object` TIME, `arrow` TIME>", + id="times", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.datetime( + 2023, 11, 23, 13, 14, 15, tzinfo=datetime.timezone.utc + ), + None, + datetime.datetime( + 1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + ], + dtype="object", + ), + "datetime64": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype="datetime64[us]", + ).dt.tz_localize("UTC"), + "arrow": pandas.Series( + [ + datetime.datetime( + 2023, 11, 23, 13, 14, 15, tzinfo=datetime.timezone.utc + ), + None, + datetime.datetime( + 1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + ], + dtype=pandas.ArrowDtype(pyarrow.timestamp("us", "UTC")), + ), + } + ), + "STRUCT<`object` TIMESTAMP, `datetime64` TIMESTAMP, `arrow` TIMESTAMP>", + id="timestamps", + ), + # More complicated edge cases: 
pytest.param( pandas.DataFrame( { @@ -135,14 +412,14 @@ def test_pyformat_with_no_variables(session): ], } ), - "STRUCT<`array of struct col` ARRAY, `subfield2` INTEGER>>>", + "STRUCT<`array of struct col` ARRAY, `subfield2` INT64>>>", id="array_of_structs", ), pytest.param( pandas.DataFrame({"c1": [1, 2, 3], "c2": ["a", "b", "c"]}).rename( columns={"c1": "c", "c2": "c"} ), - "STRUCT<`c` INTEGER, `c_1` STRING>", + "STRUCT<`c` INT64, `c_1` STRING>", id="duplicate_column_names", ), ), diff --git a/tests/unit/core/tools/test_bigquery_schema.py b/tests/unit/core/tools/test_bigquery_schema.py index a5b0087801..aed8ae0323 100644 --- a/tests/unit/core/tools/test_bigquery_schema.py +++ b/tests/unit/core/tools/test_bigquery_schema.py @@ -9,9 +9,11 @@ "field, expected_sql", [ # Simple types - (bigquery.SchemaField("test_field", "INTEGER"), "INTEGER"), + # Note: the REST API will return Legacy SQL data types, but we need to + # map to GoogleSQL. See internal issue b/428190014. + (bigquery.SchemaField("test_field", "INTEGER"), "INT64"), (bigquery.SchemaField("test_field", "STRING"), "STRING"), - (bigquery.SchemaField("test_field", "BOOLEAN"), "BOOLEAN"), + (bigquery.SchemaField("test_field", "BOOLEAN"), "BOOL"), # RECORD/STRUCT types with nested fields directly ( bigquery.SchemaField( @@ -30,7 +32,7 @@ bigquery.SchemaField("another", "BOOLEAN"), ), ), - "STRUCT<`sub_field` INTEGER, `another` BOOLEAN>", + "STRUCT<`sub_field` INT64, `another` BOOL>", ), # Array is handled by _field_to_sql, instead. (bigquery.SchemaField("test_field", "NUMERIC", mode="REPEATED"), "NUMERIC"), @@ -54,7 +56,9 @@ def test_type_to_sql(field, expected_sql): "field, expected_sql", [ # Simple field - (bigquery.SchemaField("id", "INTEGER", "NULLABLE"), "`id` INTEGER"), + # Note: the REST API will return Legacy SQL data types, but we need to + # map to GoogleSQL. See internal issue b/428190014. 
+ (bigquery.SchemaField("id", "INTEGER", "NULLABLE"), "`id` INT64"), (bigquery.SchemaField("name", "STRING", "NULLABLE"), "`name` STRING"), # Repeated field (bigquery.SchemaField("tags", "STRING", "REPEATED"), "`tags` ARRAY"), @@ -69,7 +73,7 @@ def test_type_to_sql(field, expected_sql): bigquery.SchemaField("zip", "INTEGER"), ), ), - "`addresses` ARRAY>", + "`addresses` ARRAY>", ), # Simple STRUCT ( @@ -82,7 +86,7 @@ def test_type_to_sql(field, expected_sql): bigquery.SchemaField("city", "STRING"), ), ), - "`person` STRUCT<`age` INTEGER, `city` STRING>", + "`person` STRUCT<`age` INT64, `city` STRING>", ), ], ) @@ -102,7 +106,7 @@ def test_field_to_sql(field, expected_sql): bigquery.SchemaField("id", "INTEGER"), bigquery.SchemaField("name", "STRING"), ), - "STRUCT<`id` INTEGER, `name` STRING>", + "STRUCT<`id` INT64, `name` STRING>", ), # Nested RECORD/STRUCT ( @@ -118,7 +122,7 @@ def test_field_to_sql(field, expected_sql): ), ), ), - "STRUCT<`item_id` INTEGER, `details` STRUCT<`price` NUMERIC, `currency` STRING>>", + "STRUCT<`item_id` INT64, `details` STRUCT<`price` NUMERIC, `currency` STRING>>", ), # Repeated field ( @@ -143,7 +147,7 @@ def test_field_to_sql(field, expected_sql): ), bigquery.SchemaField("timestamp", "TIMESTAMP"), ), - "STRUCT<`event_name` STRING, `participants` ARRAY>>, `timestamp` TIMESTAMP>", + "STRUCT<`event_name` STRING, `participants` ARRAY>>, `timestamp` TIMESTAMP>", ), ], ) @@ -163,7 +167,7 @@ def test_to_struct(bqschema, expected_sql): bigquery.SchemaField("id", "INTEGER"), bigquery.SchemaField("name", "STRING"), ), - "UNNEST(ARRAY>[])", + "UNNEST(ARRAY>[])", ), # Complex schema with nested and repeated fields ( @@ -179,7 +183,7 @@ def test_to_struct(bqschema, expected_sql): ), ), ), - "UNNEST(ARRAY>>>[])", + "UNNEST(ARRAY>>>[])", ), ], ) diff --git a/tests/unit/functions/test_remote_function.py b/tests/unit/functions/test_remote_function.py index 978281e5c9..ea09ac59d3 100644 --- a/tests/unit/functions/test_remote_function.py +++ 
b/tests/unit/functions/test_remote_function.py @@ -89,3 +89,57 @@ def function_without_return_annotation(myparam: int): match="'output_type' was not set .* missing a return type annotation", ): remote_function_decorator(function_without_return_annotation) + + +def test_deploy_remote_function(): + session = mocks.create_bigquery_session() + + def my_remote_func(x: int) -> int: + return x * 2 + + deployed = session.deploy_remote_function( + my_remote_func, cloud_function_service_account="test_sa@example.com" + ) + + # Test that the function would have been deployed somewhere. + assert deployed.bigframes_bigquery_function + + +def test_deploy_remote_function_with_name(): + session = mocks.create_bigquery_session() + + def my_remote_func(x: int) -> int: + return x * 2 + + deployed = session.deploy_remote_function( + my_remote_func, + name="my_custom_name", + cloud_function_service_account="test_sa@example.com", + ) + + # Test that the function would have been deployed somewhere. + assert "my_custom_name" in deployed.bigframes_bigquery_function + + +def test_deploy_udf(): + session = mocks.create_bigquery_session() + + def my_remote_func(x: int) -> int: + return x * 2 + + deployed = session.deploy_udf(my_remote_func) + + # Test that the function would have been deployed somewhere. + assert deployed.bigframes_bigquery_function + + +def test_deploy_udf_with_name(): + session = mocks.create_bigquery_session() + + def my_remote_func(x: int) -> int: + return x * 2 + + deployed = session.deploy_udf(my_remote_func, name="my_custom_name") + + # Test that the function would have been deployed somewhere. 
+ assert "my_custom_name" in deployed.bigframes_bigquery_function diff --git a/tests/unit/session/test_io_arrow.py b/tests/unit/session/test_io_arrow.py new file mode 100644 index 0000000000..d5266220d9 --- /dev/null +++ b/tests/unit/session/test_io_arrow.py @@ -0,0 +1,133 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime + +import pyarrow as pa +import pytest + +import bigframes.pandas as bpd +from bigframes.testing import mocks + + +@pytest.fixture(scope="module") +def session(): + # Use the mock session from bigframes.testing + return mocks.create_bigquery_session() + + +def test_read_arrow_empty_table(session): + empty_table = pa.Table.from_pydict( + { + "col_a": pa.array([], type=pa.int64()), + "col_b": pa.array([], type=pa.string()), + } + ) + df = session.read_arrow(empty_table) + assert isinstance(df, bpd.DataFrame) + assert df.shape == (0, 2) + assert list(df.columns) == ["col_a", "col_b"] + pd_df = df.to_pandas() + assert pd_df.empty + assert list(pd_df.columns) == ["col_a", "col_b"] + assert pd_df["col_a"].dtype == "Int64" + assert pd_df["col_b"].dtype == "string[pyarrow]" + + +@pytest.mark.parametrize( + "data,arrow_type,expected_bq_type_kind", + [ + ([1, 2], pa.int8(), "INTEGER"), + ([1, 2], pa.int16(), "INTEGER"), + ([1, 2], pa.int32(), "INTEGER"), + ([1, 2], pa.int64(), "INTEGER"), + ([1.0, 2.0], pa.float32(), "FLOAT"), + ([1.0, 2.0], pa.float64(), "FLOAT"), + ([True, False], pa.bool_(), "BOOLEAN"), + 
(["a", "b"], pa.string(), "STRING"), + (["a", "b"], pa.large_string(), "STRING"), + ([b"a", b"b"], pa.binary(), "BYTES"), + ([b"a", b"b"], pa.large_binary(), "BYTES"), + ( + [ + pa.scalar(1000, type=pa.duration("s")), + pa.scalar(2000, type=pa.duration("s")), + ], + pa.duration("s"), + "INTEGER", + ), + ([datetime.date(2023, 1, 1)], pa.date32(), "DATE"), + ( + [datetime.datetime(2023, 1, 1, 12, 0, 0, tzinfo=datetime.timezone.utc)], + pa.timestamp("s", tz="UTC"), + "TIMESTAMP", + ), + ( + [datetime.datetime(2023, 1, 1, 12, 0, 0, tzinfo=datetime.timezone.utc)], + pa.timestamp("ms", tz="UTC"), + "TIMESTAMP", + ), + ( + [datetime.datetime(2023, 1, 1, 12, 0, 0, tzinfo=datetime.timezone.utc)], + pa.timestamp("us", tz="UTC"), + "TIMESTAMP", + ), + ([datetime.time(12, 34, 56, 789000)], pa.time64("us"), "TIME"), + ], +) +def test_read_arrow_type_mappings(session, data, arrow_type, expected_bq_type_kind): + """ + Tests that various arrow types are mapped to the expected BigQuery types. + This is an indirect check via the resulting DataFrame's schema. 
+ """ + pa_table = pa.Table.from_arrays([pa.array(data, type=arrow_type)], names=["col"]) + df = session.read_arrow(pa_table) + + bigquery_schema = df._block.expr.schema.to_bigquery() + assert len(bigquery_schema) == 2 # offsets + value + field = bigquery_schema[-1] + assert field.field_type.upper() == expected_bq_type_kind + + # Also check pandas dtype after conversion for good measure + pd_df = df.to_pandas() + assert pd_df["col"].shape == (len(data),) + + +def test_read_arrow_list_type(session): + pa_table = pa.Table.from_arrays( + [pa.array([[1, 2], [3, 4, 5]], type=pa.list_(pa.int64()))], names=["list_col"] + ) + df = session.read_arrow(pa_table) + + bigquery_schema = df._block.expr.schema.to_bigquery() + assert len(bigquery_schema) == 2 # offsets + value + field = bigquery_schema[-1] + assert field.mode.upper() == "REPEATED" + assert field.field_type.upper() == "INTEGER" + + +def test_read_arrow_struct_type(session): + struct_type = pa.struct([("a", pa.int64()), ("b", pa.string())]) + pa_table = pa.Table.from_arrays( + [pa.array([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}], type=struct_type)], + names=["struct_col"], + ) + df = session.read_arrow(pa_table) + + bigquery_schema = df._block.expr.schema.to_bigquery() + assert len(bigquery_schema) == 2 # offsets + value + field = bigquery_schema[-1] + assert field.field_type.upper() == "RECORD" + assert field.fields[0].name == "a" + assert field.fields[1].name == "b" diff --git a/tests/unit/test_clients.py b/tests/unit/test_clients.py index 032512c26e..9daa759838 100644 --- a/tests/unit/test_clients.py +++ b/tests/unit/test_clients.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from unittest import mock + +from google.cloud import bigquery_connection_v1, resourcemanager_v3 +from google.iam.v1 import policy_pb2 import pytest from bigframes import clients @@ -65,3 +69,27 @@ def test_get_canonical_bq_connection_id_invalid_path(): default_project="default-project", default_location="us", ) + + +def test_ensure_iam_binding(): + bq_connection_client = mock.create_autospec( + bigquery_connection_v1.ConnectionServiceClient, instance=True + ) + resource_manager_client = mock.create_autospec( + resourcemanager_v3.ProjectsClient, instance=True + ) + resource_manager_client.get_iam_policy.return_value = policy_pb2.Policy( + bindings=[ + policy_pb2.Binding( + role="roles/test.role1", members=["serviceAccount:serviceAccount1"] + ) + ] + ) + bq_connection_manager = clients.BqConnectionManager( + bq_connection_client, resource_manager_client + ) + bq_connection_manager._IAM_WAIT_SECONDS = 0 # no need to wait in test + bq_connection_manager._ensure_iam_binding( + "test-project", "serviceAccount2", "roles/test.role2" + ) + resource_manager_client.set_iam_policy.assert_called_once() diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index b434e473e9..f7f0cc80bb 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -2150,7 +2150,7 @@ def test_df_corrwith_series(scalars_dfs): operator.sub, operator.mul, operator.truediv, - # operator.floordiv, + operator.floordiv, operator.eq, operator.ne, operator.gt, @@ -2163,7 +2163,7 @@ def test_df_corrwith_series(scalars_dfs): "subtract", "multiply", "true_divide", - # "floor_divide", + "floor_divide", "eq", "ne", "gt", @@ -2217,8 +2217,8 @@ def test_scalar_binop_str_exception(scalars_dfs): (lambda x, y: x.rmul(y, axis="index")), (lambda x, y: x.truediv(y, axis="index")), (lambda x, y: x.rtruediv(y, axis="index")), - # (lambda x, y: x.floordiv(y, axis="index")), - # (lambda x, y: x.floordiv(y, axis="index")), + (lambda x, y: x.floordiv(y, 
axis="index")), + (lambda x, y: x.floordiv(y, axis="index")), (lambda x, y: x.gt(y, axis="index")), (lambda x, y: x.ge(y, axis="index")), (lambda x, y: x.lt(y, axis="index")), @@ -2233,8 +2233,8 @@ def test_scalar_binop_str_exception(scalars_dfs): "rmul", "truediv", "rtruediv", - # "floordiv", - # "rfloordiv", + "floordiv", + "rfloordiv", "gt", "ge", "lt", diff --git a/third_party/bigframes_vendored/google_cloud_bigquery/_pandas_helpers.py b/third_party/bigframes_vendored/google_cloud_bigquery/_pandas_helpers.py index 5e2a7a7ef0..3e35b1382e 100644 --- a/third_party/bigframes_vendored/google_cloud_bigquery/_pandas_helpers.py +++ b/third_party/bigframes_vendored/google_cloud_bigquery/_pandas_helpers.py @@ -17,6 +17,7 @@ import warnings +import db_dtypes import google.cloud.bigquery.schema as schema import pyarrow @@ -61,6 +62,7 @@ def pyarrow_timestamp(): "TIME": pyarrow_time, "TIMESTAMP": pyarrow_timestamp, "BIGNUMERIC": pyarrow_bignumeric, + "JSON": db_dtypes.JSONArrowType, } ARROW_SCALAR_IDS_TO_BQ = { # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 224fe25f16..0606032d34 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4124,6 +4124,7 @@ def explode( **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'A': [[0, 1, 2], [], [], [3, 4]], diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 5d2de2f97f..4f3c9a5124 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.8.0" +__version__ = "2.9.0" # {x-release-please-start-date} -__release_date__ = "2025-06-23" +__release_date__ = "2025-06-30" # {x-release-please-end} diff --git a/third_party/logo/colab-logo.png b/third_party/logo/colab-logo.png new file mode 100644 index 0000000000..75740a2b6a Binary files /dev/null and b/third_party/logo/colab-logo.png differ diff --git a/third_party/logo/github-logo.png b/third_party/logo/github-logo.png new file mode 100644 index 0000000000..8b25551a97 Binary files /dev/null and b/third_party/logo/github-logo.png differ
\n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", "