From c53c86fdb4655040442e74a329df9282f932e681 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 23 Jun 2025 11:55:45 -0700 Subject: [PATCH 01/28] chore: add compile_filter (#1817) --- bigframes/core/compile/sqlglot/compiler.py | 7 ++++++ .../core/compile/sqlglot/scalar_compiler.py | 7 ++++++ bigframes/core/compile/sqlglot/sqlglot_ir.py | 10 ++++++++ .../test_compile_filter/out.sql | 25 +++++++++++++++++++ .../compile/sqlglot/test_compile_filter.py | 25 +++++++++++++++++++ 5 files changed, 74 insertions(+) create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_filter/test_compile_filter/out.sql create mode 100644 tests/unit/core/compile/sqlglot/test_compile_filter.py diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 3b7abd8463..a38078b687 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -211,6 +211,13 @@ def compile_projection( ) return child.project(projected_cols) + @_compile_node.register + def compile_filter( + self, node: nodes.FilterNode, child: ir.SQLGlotIR + ) -> ir.SQLGlotIR: + condition = scalar_compiler.compile_scalar_expression(node.predicate) + return child.filter(condition) + @_compile_node.register def compile_concat( self, node: nodes.ConcatNode, *children: ir.SQLGlotIR diff --git a/bigframes/core/compile/sqlglot/scalar_compiler.py b/bigframes/core/compile/sqlglot/scalar_compiler.py index 00ec892620..59deb9c8f3 100644 --- a/bigframes/core/compile/sqlglot/scalar_compiler.py +++ b/bigframes/core/compile/sqlglot/scalar_compiler.py @@ -99,3 +99,10 @@ def compile_addop(op: ops.AddOp, left: TypedExpr, right: TypedExpr) -> sge.Expre # Numerical addition return sge.Add(this=left.expr, expression=right.expr) + + +def compile_ge( + op: ops.ge_op, left: TypedExpr, right: TypedExpr # type: ignore[valid-type] +) -> sge.Expression: + + return sge.GTE(this=left.expr, expression=right.expr) diff --git 
a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 47dab209d0..3766ab3266 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -250,6 +250,16 @@ def project( new_expr = self._encapsulate_as_cte().select(*projected_cols_expr, append=True) return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) + def filter( + self, + condition: sge.Expression, + ) -> SQLGlotIR: + """Filters the query with the given condition.""" + new_expr = self._encapsulate_as_cte() + return SQLGlotIR( + expr=new_expr.where(condition, append=False), uid_gen=self.uid_gen + ) + def insert( self, destination: bigquery.TableReference, diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_filter/test_compile_filter/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_filter/test_compile_filter/out.sql new file mode 100644 index 0000000000..9ca7fb6a74 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_filter/test_compile_filter/out.sql @@ -0,0 +1,25 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0`, + `rowindex` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_1` AS `bfcol_5`, + `bfcol_1` AS `bfcol_6`, + `bfcol_0` AS `bfcol_7`, + `bfcol_1` >= 1 AS `bfcol_8` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + * + FROM `bfcte_1` + WHERE + `bfcol_8` +) +SELECT + `bfcol_5` AS `rowindex`, + `bfcol_6` AS `rowindex_1`, + `bfcol_7` AS `int64_col` +FROM `bfcte_2` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_filter.py b/tests/unit/core/compile/sqlglot/test_compile_filter.py new file mode 100644 index 0000000000..03b54f289a --- /dev/null +++ b/tests/unit/core/compile/sqlglot/test_compile_filter.py @@ -0,0 +1,25 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import bigframes.pandas as bpd + +pytest.importorskip("pytest_snapshot") + + +def test_compile_filter(scalars_types_df: bpd.DataFrame, snapshot): + bf_df = scalars_types_df[["rowindex", "int64_col"]] + bf_filter = bf_df[bf_df["rowindex"] >= 1] + snapshot.assert_match(bf_filter.sql, "out.sql") From 7a83224cbf38d995321d222830671103cff48607 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 23 Jun 2025 12:26:57 -0700 Subject: [PATCH 02/28] test: Add slice op cross validation testing (#1837) --- bigframes/session/polars_executor.py | 1 + tests/system/small/engines/test_slicing.py | 56 ++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 tests/system/small/engines/test_slicing.py diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py index 6e3e15499d..b2f7f5ccd6 100644 --- a/bigframes/session/polars_executor.py +++ b/bigframes/session/polars_executor.py @@ -31,6 +31,7 @@ nodes.OrderByNode, nodes.ReversedNode, nodes.SelectionNode, + nodes.SliceNode, ) _COMPATIBLE_SCALAR_OPS = () diff --git a/tests/system/small/engines/test_slicing.py b/tests/system/small/engines/test_slicing.py new file mode 100644 index 0000000000..7340ff145b --- /dev/null +++ b/tests/system/small/engines/test_slicing.py @@ -0,0 +1,56 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from bigframes.core import array_value, nodes +from bigframes.session import polars_executor +from bigframes.testing.engine_utils import assert_equivalence_execution + +pytest.importorskip("polars") + +# Polars used as reference as its fast and local. Generally though, prefer gbq engine where they disagree. +REFERENCE_ENGINE = polars_executor.PolarsExecutor() + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize( + ("start", "stop", "step"), + [ + (1, None, None), + (None, 4, None), + (None, None, 2), + (None, 50_000_000_000, 1), + (5, 4, None), + (3, None, 2), + (1, 7, 2), + (1, 7, 50_000_000_000), + (-1, -7, -2), + (None, -7, -2), + (-1, None, -2), + (-7, -1, 2), + (-7, -1, None), + (-7, 7, None), + (7, -7, -2), + ], +) +def test_engines_slice( + scalars_array_value: array_value.ArrayValue, + engine, + start, + stop, + step, +): + node = nodes.SliceNode(scalars_array_value.node, start, stop, step) + assert_equivalence_execution(node, REFERENCE_ENGINE, engine) From 186353888db537b561ee994256f998df361b4071 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Mon, 23 Jun 2025 19:26:15 -0700 Subject: [PATCH 03/28] docs: update ai.forecast notebook (#1844) --- .../bq_dataframes_ai_forecast.ipynb | 1691 ++++++++--------- 1 file changed, 838 insertions(+), 853 deletions(-) diff --git a/notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb b/notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb index 05e75b37f0..7734bd815d 100644 --- 
a/notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb +++ b/notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb @@ -71,10 +71,14 @@ "source": [ "PROJECT = \"bigframes-dev\" # replace with your project\n", "\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\") \n", + "\n", "import bigframes\n", "# Setup project\n", "bigframes.options.bigquery.project = PROJECT\n", "bigframes.options.display.progress_bar = None\n", + "bigframes.options.bigquery.ordering_mode = \"partial\" # Optional: partial ordering mode can accelerate executions and save costs\n", "\n", "import bigframes.pandas as bpd" ] @@ -138,603 +142,603 @@ " \n", " \n", " 0\n", - " 1304531\n", - " 597\n", - " 2016-08-05 10:55:00+00:00\n", - " San Francisco Caltrain 2 (330 Townsend)\n", - " 69\n", - " 2016-08-05 11:05:00+00:00\n", - " Powell Street BART\n", - " 39\n", - " 214\n", - " 95121\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 201802092135083596\n", + " 788\n", + " 2018-02-09 21:35:08+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 2018-02-09 21:48:17+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 3596\n", " <NA>\n", + " ...\n", " <NA>\n", - " None\n", - " None\n", + " 37.792714\n", + " -122.24878\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", + " Yes\n", + " POINT (-122.24878 37.79271)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 1\n", - " 184870\n", - " 403\n", - " 2014-02-14 14:50:00+00:00\n", - " Howard at 2nd\n", - " 63\n", - " 2014-02-14 14:56:00+00:00\n", - " Commercial at Montgomery\n", - " 45\n", - " 342\n", - " 94122\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 20171217135737144\n", + " 1072\n", + " 2017-12-17 13:57:37+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 2017-12-17 14:15:30+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 144\n", " <NA>\n", + " ...\n", " <NA>\n", + " 37.792714\n", + " 
-122.24878\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", " <NA>\n", - " None\n", - " None\n", + " POINT (-122.24878 37.79271)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 2\n", - " 20170702115603836\n", - " 16695\n", - " 2017-07-02 11:56:03+00:00\n", - " Union Square (Powell St at Post St)\n", - " 324\n", - " 2017-07-02 16:34:19+00:00\n", - " Union Square (Powell St at Post St)\n", - " 324\n", - " 836\n", + " 201803261642393539\n", + " 486\n", + " 2018-03-26 16:42:39+00:00\n", + " 10th St at Fallon St\n", + " 201\n", + " 2018-03-26 16:50:46+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 3539\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.7883\n", - " -122.408531\n", - " 37.7883\n", - " -122.408531\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " POINT (-122.40853 37.7883)\n", - " POINT (-122.40853 37.7883)\n", + " 37.797673\n", + " -122.262997\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", + " Yes\n", + " POINT (-122.263 37.79767)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 3\n", - " 1066810\n", - " 953\n", - " 2016-01-21 08:24:00+00:00\n", - " Civic Center BART (7th at Market)\n", - " 72\n", - " 2016-01-21 08:40:00+00:00\n", - " Embarcadero at Sansome\n", - " 60\n", - " 212\n", - " 94103\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 201802281657253632\n", + " 560\n", + " 2018-02-28 16:57:25+00:00\n", + " 10th St at Fallon St\n", + " 201\n", + " 2018-02-28 17:06:46+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 3632\n", " <NA>\n", + " ...\n", " <NA>\n", - " None\n", - " None\n", + " 37.797673\n", + " -122.262997\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", + " Yes\n", + " POINT (-122.263 37.79767)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 4\n", - " 220481\n", - " 679\n", - " 2014-03-19 19:20:00+00:00\n", - " San Francisco Caltrain 2 (330 Townsend)\n", - " 69\n", - " 2014-03-19 19:31:00+00:00\n", - 
" Civic Center BART (7th at Market)\n", - " 72\n", - " 478\n", - " 94107\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", + " 201708152357422491\n", + " 965\n", + " 2017-08-15 23:57:42+00:00\n", + " 10th St at Fallon St\n", + " 201\n", + " 2017-08-16 00:13:48+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 2491\n", " <NA>\n", + " ...\n", " <NA>\n", + " 37.797673\n", + " -122.262997\n", + " 37.792714\n", + " -122.24878\n", " <NA>\n", " <NA>\n", " <NA>\n", - " None\n", - " None\n", + " POINT (-122.263 37.79767)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 5\n", - " 738474\n", - " 358\n", - " 2015-04-23 16:45:00+00:00\n", - " 2nd at Folsom\n", - " 62\n", - " 2015-04-23 16:51:00+00:00\n", - " Steuart at Market\n", - " 74\n", - " 443\n", - " 94105\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 201801161800473291\n", + " 489\n", + " 2018-01-16 18:00:47+00:00\n", + " 10th St at Fallon St\n", + " 201\n", + " 2018-01-16 18:08:56+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 3291\n", " <NA>\n", + " ...\n", " <NA>\n", - " None\n", - " None\n", + " 37.797673\n", + " -122.262997\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", + " Yes\n", + " POINT (-122.263 37.79767)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 6\n", - " 229264\n", - " 286\n", - " 2014-03-27 17:56:00+00:00\n", - " Embarcadero at Sansome\n", - " 60\n", - " 2014-03-27 18:01:00+00:00\n", - " Davis at Jackson\n", - " 42\n", - " 342\n", - " 94133\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 201802201913231257\n", + " 596\n", + " 2018-02-20 19:13:23+00:00\n", + " 10th St at Fallon St\n", + " 201\n", + " 2018-02-20 19:23:19+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 1257\n", " <NA>\n", + " ...\n", " <NA>\n", - " None\n", - " None\n", + " 37.797673\n", + " -122.262997\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + 
" Male\n", + " Yes\n", + " POINT (-122.263 37.79767)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 7\n", - " 352010\n", - " 3621\n", - " 2014-07-06 13:55:00+00:00\n", - " Embarcadero at Sansome\n", - " 60\n", - " 2014-07-06 14:55:00+00:00\n", - " Embarcadero at Sansome\n", - " 60\n", - " 390\n", - " 4038\n", - " ...\n", - " Customer\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 201708242325001279\n", + " 1341\n", + " 2017-08-24 23:25:00+00:00\n", + " 10th St at Fallon St\n", + " 201\n", + " 2017-08-24 23:47:22+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 1279\n", " <NA>\n", + " ...\n", " <NA>\n", + " 37.797673\n", + " -122.262997\n", + " 37.792714\n", + " -122.24878\n", + " 1969\n", + " Male\n", " <NA>\n", - " None\n", - " None\n", + " POINT (-122.263 37.79767)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 8\n", - " 156255\n", - " 416\n", - " 2014-01-16 18:06:00+00:00\n", - " Embarcadero at Bryant\n", - " 54\n", - " 2014-01-16 18:13:00+00:00\n", - " San Francisco Caltrain (Townsend at 4th)\n", - " 70\n", - " 510\n", - " 94107\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 20170913210653295\n", + " 367\n", + " 2017-09-13 21:06:53+00:00\n", + " 10th St at Fallon St\n", + " 201\n", + " 2017-09-13 21:13:00+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 295\n", " <NA>\n", + " ...\n", " <NA>\n", + " 37.797673\n", + " -122.262997\n", + " 37.792714\n", + " -122.24878\n", + " 1987\n", + " Male\n", " <NA>\n", - " None\n", - " None\n", + " POINT (-122.263 37.79767)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 9\n", - " 1040197\n", - " 1054\n", - " 2015-12-15 18:05:00+00:00\n", - " Steuart at Market\n", - " 74\n", - " 2015-12-15 18:22:00+00:00\n", - " San Francisco Caltrain (Townsend at 4th)\n", - " 70\n", - " 700\n", - " 94111\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", + " 201708192053311490\n", + " 743\n", + " 2017-08-19 20:53:31+00:00\n", + " 2nd 
Ave at E 18th St\n", + " 200\n", + " 2017-08-19 21:05:54+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 1490\n", " <NA>\n", + " ...\n", " <NA>\n", + " 37.800214\n", + " -122.25381\n", + " 37.792714\n", + " -122.24878\n", " <NA>\n", " <NA>\n", " <NA>\n", - " None\n", - " None\n", + " POINT (-122.25381 37.80021)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 10\n", - " 1152693\n", - " 562\n", - " 2016-04-07 08:18:00+00:00\n", - " San Francisco Caltrain (Townsend at 4th)\n", - " 70\n", - " 2016-04-07 08:27:00+00:00\n", - " Steuart at Market\n", - " 74\n", - " 419\n", - " 94158\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", + " 20170810204454839\n", + " 1256\n", + " 2017-08-10 20:44:54+00:00\n", + " 2nd Ave at E 18th St\n", + " 200\n", + " 2017-08-10 21:05:50+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 839\n", " <NA>\n", + " ...\n", " <NA>\n", + " 37.800214\n", + " -122.25381\n", + " 37.792714\n", + " -122.24878\n", " <NA>\n", " <NA>\n", " <NA>\n", - " None\n", - " None\n", + " POINT (-122.25381 37.80021)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 11\n", - " 201804191735183401\n", - " 887\n", - " 2018-04-19 17:35:18+00:00\n", - " Montgomery St BART Station (Market St at 2nd St)\n", - " 21\n", - " 2018-04-19 17:50:06+00:00\n", - " Civic Center/UN Plaza BART Station (Market St ...\n", - " 44\n", - " 3401\n", + " 201711181823281960\n", + " 353\n", + " 2017-11-18 18:23:28+00:00\n", + " 2nd Ave at E 18th St\n", + " 200\n", + " 2017-11-18 18:29:22+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 1960\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.789625\n", - " -122.400811\n", - " 37.781074\n", - " -122.411738\n", - " 1979\n", + " 37.800214\n", + " -122.25381\n", + " 37.792714\n", + " -122.24878\n", + " 1988\n", " Male\n", - " No\n", - " POINT (-122.40081 37.78963)\n", - " POINT (-122.41174 37.78107)\n", + " <NA>\n", + " POINT (-122.25381 37.80021)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 12\n", - " 
209283\n", - " 943\n", - " 2014-03-11 09:01:00+00:00\n", - " South Van Ness at Market\n", - " 66\n", - " 2014-03-11 09:16:00+00:00\n", - " Temporary Transbay Terminal (Howard at Beale)\n", - " 55\n", - " 532\n", - " 94105\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 201801111613101305\n", + " 858\n", + " 2018-01-11 16:13:10+00:00\n", + " Frank H Ogawa Plaza\n", + " 7\n", + " 2018-01-11 16:27:28+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 1305\n", " <NA>\n", + " ...\n", " <NA>\n", - " None\n", - " None\n", + " 37.804562\n", + " -122.271738\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", + " Yes\n", + " POINT (-122.27174 37.80456)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 13\n", - " 201708281404312530\n", - " 389\n", - " 2017-08-28 14:04:31+00:00\n", - " 16th St at Prosper St\n", - " 105\n", - " 2017-08-28 14:11:00+00:00\n", - " Mission Playground\n", - " 121\n", - " 2530\n", + " 201712181738372587\n", + " 807\n", + " 2017-12-18 17:38:37+00:00\n", + " Frank H Ogawa Plaza\n", + " 7\n", + " 2017-12-18 17:52:04+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 2587\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.764285\n", - " -122.431804\n", - " 37.75921\n", - " -122.421339\n", - " 1981\n", + " 37.804562\n", + " -122.271738\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", " Male\n", " <NA>\n", - " POINT (-122.4318 37.76428)\n", - " POINT (-122.42134 37.75921)\n", + " POINT (-122.27174 37.80456)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 14\n", - " 20171124115158841\n", - " 384\n", - " 2017-11-24 11:51:58+00:00\n", - " 2nd Ave at E 18th St\n", - " 200\n", - " 2017-11-24 11:58:23+00:00\n", - " El Embarcadero at Grand Ave\n", - " 197\n", - " 841\n", + " 201803161910283751\n", + " 564\n", + " 2018-03-16 19:10:28+00:00\n", + " Frank H Ogawa Plaza\n", + " 7\n", + " 2018-03-16 19:19:52+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 3751\n", " 
<NA>\n", " ...\n", " <NA>\n", - " 37.800214\n", - " -122.25381\n", - " 37.808848\n", - " -122.24968\n", - " 1977\n", - " Female\n", - " <NA>\n", - " POINT (-122.25381 37.80021)\n", - " POINT (-122.24968 37.80885)\n", + " 37.804562\n", + " -122.271738\n", + " 37.792714\n", + " -122.24878\n", + " 1987\n", + " Male\n", + " No\n", + " POINT (-122.27174 37.80456)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 15\n", - " 1321042\n", - " 874\n", - " 2016-08-18 08:14:00+00:00\n", - " San Francisco Caltrain (Townsend at 4th)\n", - " 70\n", - " 2016-08-18 08:29:00+00:00\n", - " Beale at Market\n", - " 56\n", - " 390\n", - " 95050\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 201802241826551215\n", + " 1235\n", + " 2018-02-24 18:26:55+00:00\n", + " Frank H Ogawa Plaza\n", + " 7\n", + " 2018-02-24 18:47:31+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 1215\n", " <NA>\n", + " ...\n", " <NA>\n", - " None\n", - " None\n", + " 37.804562\n", + " -122.271738\n", + " 37.792714\n", + " -122.24878\n", + " 1969\n", + " Male\n", + " No\n", + " POINT (-122.27174 37.80456)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 16\n", - " 201712131325183120\n", - " 1376\n", - " 2017-12-13 13:25:18+00:00\n", - " Steuart St at Market St\n", - " 16\n", - " 2017-12-13 13:48:14+00:00\n", - " The Embarcadero at Sansome St\n", - " 6\n", - " 3120\n", + " 20171212152403227\n", + " 854\n", + " 2017-12-12 15:24:03+00:00\n", + " Frank H Ogawa Plaza\n", + " 7\n", + " 2017-12-12 15:38:17+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 227\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.79413\n", - " -122.39443\n", - " 37.80477\n", - " -122.403234\n", - " <NA>\n", - " <NA>\n", + " 37.804562\n", + " -122.271738\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", " <NA>\n", - " POINT (-122.39443 37.79413)\n", - " POINT (-122.40323 37.80477)\n", + " POINT (-122.27174 37.80456)\n", + " POINT (-122.24878 
37.79271)\n", " \n", " \n", " 17\n", - " 201708310827151646\n", - " 200\n", - " 2017-08-31 08:27:15+00:00\n", - " Powell St BART Station (Market St at 4th St)\n", - " 3\n", - " 2017-08-31 08:30:36+00:00\n", - " Montgomery St BART Station (Market St at 2nd St)\n", - " 21\n", - " 1646\n", + " 201803091621483450\n", + " 857\n", + " 2018-03-09 16:21:48+00:00\n", + " Frank H Ogawa Plaza\n", + " 7\n", + " 2018-03-09 16:36:06+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 3450\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.786375\n", - " -122.404904\n", - " 37.789625\n", - " -122.400811\n", - " 1988\n", + " 37.804562\n", + " -122.271738\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", " Male\n", - " <NA>\n", - " POINT (-122.4049 37.78638)\n", - " POINT (-122.40081 37.78963)\n", + " Yes\n", + " POINT (-122.27174 37.80456)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 18\n", - " 201801251754102907\n", - " 1490\n", - " 2018-01-25 17:54:10+00:00\n", - " Esprit Park\n", - " 126\n", - " 2018-01-25 18:19:01+00:00\n", - " The Embarcadero at Vallejo St\n", - " 8\n", - " 2907\n", + " 201801021932232717\n", + " 914\n", + " 2018-01-02 19:32:23+00:00\n", + " Frank H Ogawa Plaza\n", + " 7\n", + " 2018-01-02 19:47:38+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 2717\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.761634\n", - " -122.390648\n", - " 37.799953\n", - " -122.398525\n", - " 1989\n", - " Female\n", - " No\n", - " POINT (-122.39065 37.76163)\n", - " POINT (-122.39852 37.79995)\n", + " 37.804562\n", + " -122.271738\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", + " Male\n", + " Yes\n", + " POINT (-122.27174 37.80456)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 19\n", - " 201709230951302222\n", - " 319\n", - " 2017-09-23 09:51:30+00:00\n", - " 7th St at Brannan St\n", - " 79\n", - " 2017-09-23 09:56:49+00:00\n", - " San Francisco Caltrain (Townsend St at 4th St)\n", - " 30\n", - " 2222\n", + " 201803131437033724\n", + " 917\n", + " 
2018-03-13 14:37:03+00:00\n", + " Grand Ave at Webster St\n", + " 181\n", + " 2018-03-13 14:52:20+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 3724\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.773492\n", - " -122.403672\n", - " 37.776598\n", - " -122.395282\n", - " 1975\n", + " 37.811377\n", + " -122.265192\n", + " 37.792714\n", + " -122.24878\n", + " 1989\n", " Male\n", - " <NA>\n", - " POINT (-122.40367 37.77349)\n", - " POINT (-122.39528 37.7766)\n", + " No\n", + " POINT (-122.26519 37.81138)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 20\n", - " 20180220172815415\n", - " 4009\n", - " 2018-02-20 17:28:15+00:00\n", - " Franklin St at 9th St\n", - " 162\n", - " 2018-02-20 18:35:05+00:00\n", - " Telegraph Ave at 27th St\n", - " 179\n", - " 415\n", + " 20170930184510496\n", + " 1367\n", + " 2017-09-30 18:45:10+00:00\n", + " Lake Merritt BART Station\n", + " 163\n", + " 2017-09-30 19:07:58+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 496\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.800516\n", - " -122.27208\n", - " 37.816073\n", - " -122.267886\n", - " 1973\n", - " Male\n", - " Yes\n", - " POINT (-122.27208 37.80052)\n", - " POINT (-122.26789 37.81607)\n", + " 37.79732\n", + " -122.26532\n", + " 37.792714\n", + " -122.24878\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " POINT (-122.26532 37.79732)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 21\n", - " 201710191714443003\n", - " 691\n", - " 2017-10-19 17:14:44+00:00\n", - " Harrison St at 20th St\n", - " 129\n", - " 2017-10-19 17:26:16+00:00\n", - " Valencia St at 22nd St\n", - " 133\n", - " 3003\n", + " 201712061755593426\n", + " 519\n", + " 2017-12-06 17:55:59+00:00\n", + " Lake Merritt BART Station\n", + " 163\n", + " 2017-12-06 18:04:39+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 3426\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.758862\n", - " -122.412544\n", - " 37.755213\n", - " -122.420975\n", - " 1958\n", + " 37.79732\n", + " -122.26532\n", + " 37.792714\n", 
+ " -122.24878\n", + " 1986\n", " Male\n", " <NA>\n", - " POINT (-122.41254 37.75886)\n", - " POINT (-122.42098 37.75521)\n", + " POINT (-122.26532 37.79732)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 22\n", - " 595146\n", - " 453\n", - " 2015-01-07 18:34:00+00:00\n", - " Market at 10th\n", - " 67\n", - " 2015-01-07 18:42:00+00:00\n", - " Townsend at 7th\n", - " 65\n", - " 421\n", - " 95014\n", - " ...\n", - " Subscriber\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " 201711062204002182\n", + " 420\n", + " 2017-11-06 22:04:00+00:00\n", + " Lake Merritt BART Station\n", + " 163\n", + " 2017-11-06 22:11:00+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 2182\n", " <NA>\n", + " ...\n", " <NA>\n", + " 37.79732\n", + " -122.26532\n", + " 37.792714\n", + " -122.24878\n", + " 1992\n", + " Male\n", " <NA>\n", - " None\n", - " None\n", + " POINT (-122.26532 37.79732)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 23\n", - " 201708290913502454\n", - " 788\n", - " 2017-08-29 09:13:50+00:00\n", - " San Francisco Caltrain (Townsend St at 4th St)\n", - " 30\n", - " 2017-08-29 09:26:58+00:00\n", - " The Embarcadero at Vallejo St\n", - " 8\n", - " 2454\n", + " 201709122036152238\n", + " 612\n", + " 2017-09-12 20:36:15+00:00\n", + " Lake Merritt BART Station\n", + " 163\n", + " 2017-09-12 20:46:27+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 2238\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.776598\n", - " -122.395282\n", - " 37.799953\n", - " -122.398525\n", - " 1979\n", + " 37.79732\n", + " -122.26532\n", + " 37.792714\n", + " -122.24878\n", + " 1984\n", " Male\n", " <NA>\n", - " POINT (-122.39528 37.7766)\n", - " POINT (-122.39852 37.79995)\n", + " POINT (-122.26532 37.79732)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", " 24\n", - " 201712271150433036\n", - " 150\n", - " 2017-12-27 11:50:43+00:00\n", - " Powell St BART Station (Market St at 4th St)\n", - " 3\n", - " 2017-12-27 11:53:14+00:00\n", - " 4th St at Harrison 
St\n", - " 47\n", - " 3036\n", + " 201712062310481332\n", + " 442\n", + " 2017-12-06 23:10:48+00:00\n", + " Lake Merritt BART Station\n", + " 163\n", + " 2017-12-06 23:18:11+00:00\n", + " 10th Ave at E 15th St\n", + " 222\n", + " 1332\n", " <NA>\n", " ...\n", " <NA>\n", - " 37.786375\n", - " -122.404904\n", - " 37.780955\n", - " -122.399749\n", - " 1989\n", + " 37.79732\n", + " -122.26532\n", + " 37.792714\n", + " -122.24878\n", + " 1981\n", " Male\n", " <NA>\n", - " POINT (-122.4049 37.78638)\n", - " POINT (-122.39975 37.78095)\n", + " POINT (-122.26532 37.79732)\n", + " POINT (-122.24878 37.79271)\n", " \n", " \n", "\n", @@ -742,221 +746,194 @@ "[1947417 rows x 21 columns in total]" ], "text/plain": [ - " trip_id duration_sec start_date \\\n", - "0 1304531 597 2016-08-05 10:55:00+00:00 \n", - "1 184870 403 2014-02-14 14:50:00+00:00 \n", - "2 20170702115603836 16695 2017-07-02 11:56:03+00:00 \n", - "3 1066810 953 2016-01-21 08:24:00+00:00 \n", - "4 220481 679 2014-03-19 19:20:00+00:00 \n", - "5 738474 358 2015-04-23 16:45:00+00:00 \n", - "6 229264 286 2014-03-27 17:56:00+00:00 \n", - "7 352010 3621 2014-07-06 13:55:00+00:00 \n", - "8 156255 416 2014-01-16 18:06:00+00:00 \n", - "9 1040197 1054 2015-12-15 18:05:00+00:00 \n", - "10 1152693 562 2016-04-07 08:18:00+00:00 \n", - "11 201804191735183401 887 2018-04-19 17:35:18+00:00 \n", - "12 209283 943 2014-03-11 09:01:00+00:00 \n", - "13 201708281404312530 389 2017-08-28 14:04:31+00:00 \n", - "14 20171124115158841 384 2017-11-24 11:51:58+00:00 \n", - "15 1321042 874 2016-08-18 08:14:00+00:00 \n", - "16 201712131325183120 1376 2017-12-13 13:25:18+00:00 \n", - "17 201708310827151646 200 2017-08-31 08:27:15+00:00 \n", - "18 201801251754102907 1490 2018-01-25 17:54:10+00:00 \n", - "19 201709230951302222 319 2017-09-23 09:51:30+00:00 \n", - "20 20180220172815415 4009 2018-02-20 17:28:15+00:00 \n", - "21 201710191714443003 691 2017-10-19 17:14:44+00:00 \n", - "22 595146 453 2015-01-07 18:34:00+00:00 \n", - "23 
201708290913502454 788 2017-08-29 09:13:50+00:00 \n", - "24 201712271150433036 150 2017-12-27 11:50:43+00:00 \n", + " trip_id duration_sec start_date \\\n", + "201802092135083596 788 2018-02-09 21:35:08+00:00 \n", + " 20171217135737144 1072 2017-12-17 13:57:37+00:00 \n", + "201803261642393539 486 2018-03-26 16:42:39+00:00 \n", + "201802281657253632 560 2018-02-28 16:57:25+00:00 \n", + "201708152357422491 965 2017-08-15 23:57:42+00:00 \n", + "201801161800473291 489 2018-01-16 18:00:47+00:00 \n", + "201802201913231257 596 2018-02-20 19:13:23+00:00 \n", + "201708242325001279 1341 2017-08-24 23:25:00+00:00 \n", + " 20170913210653295 367 2017-09-13 21:06:53+00:00 \n", + "201708192053311490 743 2017-08-19 20:53:31+00:00 \n", + " 20170810204454839 1256 2017-08-10 20:44:54+00:00 \n", + "201711181823281960 353 2017-11-18 18:23:28+00:00 \n", + "201801111613101305 858 2018-01-11 16:13:10+00:00 \n", + "201712181738372587 807 2017-12-18 17:38:37+00:00 \n", + "201803161910283751 564 2018-03-16 19:10:28+00:00 \n", + "201802241826551215 1235 2018-02-24 18:26:55+00:00 \n", + " 20171212152403227 854 2017-12-12 15:24:03+00:00 \n", + "201803091621483450 857 2018-03-09 16:21:48+00:00 \n", + "201801021932232717 914 2018-01-02 19:32:23+00:00 \n", + "201803131437033724 917 2018-03-13 14:37:03+00:00 \n", + " 20170930184510496 1367 2017-09-30 18:45:10+00:00 \n", + "201712061755593426 519 2017-12-06 17:55:59+00:00 \n", + "201711062204002182 420 2017-11-06 22:04:00+00:00 \n", + "201709122036152238 612 2017-09-12 20:36:15+00:00 \n", + "201712062310481332 442 2017-12-06 23:10:48+00:00 \n", "\n", - " start_station_name start_station_id \\\n", - "0 San Francisco Caltrain 2 (330 Townsend) 69 \n", - "1 Howard at 2nd 63 \n", - "2 Union Square (Powell St at Post St) 324 \n", - "3 Civic Center BART (7th at Market) 72 \n", - "4 San Francisco Caltrain 2 (330 Townsend) 69 \n", - "5 2nd at Folsom 62 \n", - "6 Embarcadero at Sansome 60 \n", - "7 Embarcadero at Sansome 60 \n", - "8 Embarcadero at Bryant 54 
\n", - "9 Steuart at Market 74 \n", - "10 San Francisco Caltrain (Townsend at 4th) 70 \n", - "11 Montgomery St BART Station (Market St at 2nd St) 21 \n", - "12 South Van Ness at Market 66 \n", - "13 16th St at Prosper St 105 \n", - "14 2nd Ave at E 18th St 200 \n", - "15 San Francisco Caltrain (Townsend at 4th) 70 \n", - "16 Steuart St at Market St 16 \n", - "17 Powell St BART Station (Market St at 4th St) 3 \n", - "18 Esprit Park 126 \n", - "19 7th St at Brannan St 79 \n", - "20 Franklin St at 9th St 162 \n", - "21 Harrison St at 20th St 129 \n", - "22 Market at 10th 67 \n", - "23 San Francisco Caltrain (Townsend St at 4th St) 30 \n", - "24 Powell St BART Station (Market St at 4th St) 3 \n", + " start_station_name start_station_id end_date \\\n", + " 10th Ave at E 15th St 222 2018-02-09 21:48:17+00:00 \n", + " 10th Ave at E 15th St 222 2017-12-17 14:15:30+00:00 \n", + " 10th St at Fallon St 201 2018-03-26 16:50:46+00:00 \n", + " 10th St at Fallon St 201 2018-02-28 17:06:46+00:00 \n", + " 10th St at Fallon St 201 2017-08-16 00:13:48+00:00 \n", + " 10th St at Fallon St 201 2018-01-16 18:08:56+00:00 \n", + " 10th St at Fallon St 201 2018-02-20 19:23:19+00:00 \n", + " 10th St at Fallon St 201 2017-08-24 23:47:22+00:00 \n", + " 10th St at Fallon St 201 2017-09-13 21:13:00+00:00 \n", + " 2nd Ave at E 18th St 200 2017-08-19 21:05:54+00:00 \n", + " 2nd Ave at E 18th St 200 2017-08-10 21:05:50+00:00 \n", + " 2nd Ave at E 18th St 200 2017-11-18 18:29:22+00:00 \n", + " Frank H Ogawa Plaza 7 2018-01-11 16:27:28+00:00 \n", + " Frank H Ogawa Plaza 7 2017-12-18 17:52:04+00:00 \n", + " Frank H Ogawa Plaza 7 2018-03-16 19:19:52+00:00 \n", + " Frank H Ogawa Plaza 7 2018-02-24 18:47:31+00:00 \n", + " Frank H Ogawa Plaza 7 2017-12-12 15:38:17+00:00 \n", + " Frank H Ogawa Plaza 7 2018-03-09 16:36:06+00:00 \n", + " Frank H Ogawa Plaza 7 2018-01-02 19:47:38+00:00 \n", + " Grand Ave at Webster St 181 2018-03-13 14:52:20+00:00 \n", + "Lake Merritt BART Station 163 2017-09-30 
19:07:58+00:00 \n", + "Lake Merritt BART Station 163 2017-12-06 18:04:39+00:00 \n", + "Lake Merritt BART Station 163 2017-11-06 22:11:00+00:00 \n", + "Lake Merritt BART Station 163 2017-09-12 20:46:27+00:00 \n", + "Lake Merritt BART Station 163 2017-12-06 23:18:11+00:00 \n", "\n", - " end_date \\\n", - "0 2016-08-05 11:05:00+00:00 \n", - "1 2014-02-14 14:56:00+00:00 \n", - "2 2017-07-02 16:34:19+00:00 \n", - "3 2016-01-21 08:40:00+00:00 \n", - "4 2014-03-19 19:31:00+00:00 \n", - "5 2015-04-23 16:51:00+00:00 \n", - "6 2014-03-27 18:01:00+00:00 \n", - "7 2014-07-06 14:55:00+00:00 \n", - "8 2014-01-16 18:13:00+00:00 \n", - "9 2015-12-15 18:22:00+00:00 \n", - "10 2016-04-07 08:27:00+00:00 \n", - "11 2018-04-19 17:50:06+00:00 \n", - "12 2014-03-11 09:16:00+00:00 \n", - "13 2017-08-28 14:11:00+00:00 \n", - "14 2017-11-24 11:58:23+00:00 \n", - "15 2016-08-18 08:29:00+00:00 \n", - "16 2017-12-13 13:48:14+00:00 \n", - "17 2017-08-31 08:30:36+00:00 \n", - "18 2018-01-25 18:19:01+00:00 \n", - "19 2017-09-23 09:56:49+00:00 \n", - "20 2018-02-20 18:35:05+00:00 \n", - "21 2017-10-19 17:26:16+00:00 \n", - "22 2015-01-07 18:42:00+00:00 \n", - "23 2017-08-29 09:26:58+00:00 \n", - "24 2017-12-27 11:53:14+00:00 \n", + " end_station_name end_station_id bike_number zip_code ... \\\n", + "10th Ave at E 15th St 222 3596 ... \n", + "10th Ave at E 15th St 222 144 ... \n", + "10th Ave at E 15th St 222 3539 ... \n", + "10th Ave at E 15th St 222 3632 ... \n", + "10th Ave at E 15th St 222 2491 ... \n", + "10th Ave at E 15th St 222 3291 ... \n", + "10th Ave at E 15th St 222 1257 ... \n", + "10th Ave at E 15th St 222 1279 ... \n", + "10th Ave at E 15th St 222 295 ... \n", + "10th Ave at E 15th St 222 1490 ... \n", + "10th Ave at E 15th St 222 839 ... \n", + "10th Ave at E 15th St 222 1960 ... \n", + "10th Ave at E 15th St 222 1305 ... \n", + "10th Ave at E 15th St 222 2587 ... \n", + "10th Ave at E 15th St 222 3751 ... \n", + "10th Ave at E 15th St 222 1215 ... 
\n", + "10th Ave at E 15th St 222 227 ... \n", + "10th Ave at E 15th St 222 3450 ... \n", + "10th Ave at E 15th St 222 2717 ... \n", + "10th Ave at E 15th St 222 3724 ... \n", + "10th Ave at E 15th St 222 496 ... \n", + "10th Ave at E 15th St 222 3426 ... \n", + "10th Ave at E 15th St 222 2182 ... \n", + "10th Ave at E 15th St 222 2238 ... \n", + "10th Ave at E 15th St 222 1332 ... \n", "\n", - " end_station_name end_station_id \\\n", - "0 Powell Street BART 39 \n", - "1 Commercial at Montgomery 45 \n", - "2 Union Square (Powell St at Post St) 324 \n", - "3 Embarcadero at Sansome 60 \n", - "4 Civic Center BART (7th at Market) 72 \n", - "5 Steuart at Market 74 \n", - "6 Davis at Jackson 42 \n", - "7 Embarcadero at Sansome 60 \n", - "8 San Francisco Caltrain (Townsend at 4th) 70 \n", - "9 San Francisco Caltrain (Townsend at 4th) 70 \n", - "10 Steuart at Market 74 \n", - "11 Civic Center/UN Plaza BART Station (Market St ... 44 \n", - "12 Temporary Transbay Terminal (Howard at Beale) 55 \n", - "13 Mission Playground 121 \n", - "14 El Embarcadero at Grand Ave 197 \n", - "15 Beale at Market 56 \n", - "16 The Embarcadero at Sansome St 6 \n", - "17 Montgomery St BART Station (Market St at 2nd St) 21 \n", - "18 The Embarcadero at Vallejo St 8 \n", - "19 San Francisco Caltrain (Townsend St at 4th St) 30 \n", - "20 Telegraph Ave at 27th St 179 \n", - "21 Valencia St at 22nd St 133 \n", - "22 Townsend at 7th 65 \n", - "23 The Embarcadero at Vallejo St 8 \n", - "24 4th St at Harrison St 47 \n", + "c_subscription_type start_station_latitude start_station_longitude \\\n", + " 37.792714 -122.24878 \n", + " 37.792714 -122.24878 \n", + " 37.797673 -122.262997 \n", + " 37.797673 -122.262997 \n", + " 37.797673 -122.262997 \n", + " 37.797673 -122.262997 \n", + " 37.797673 -122.262997 \n", + " 37.797673 -122.262997 \n", + " 37.797673 -122.262997 \n", + " 37.800214 -122.25381 \n", + " 37.800214 -122.25381 \n", + " 37.800214 -122.25381 \n", + " 37.804562 -122.271738 \n", + " 37.804562 
-122.271738 \n", + " 37.804562 -122.271738 \n", + " 37.804562 -122.271738 \n", + " 37.804562 -122.271738 \n", + " 37.804562 -122.271738 \n", + " 37.804562 -122.271738 \n", + " 37.811377 -122.265192 \n", + " 37.79732 -122.26532 \n", + " 37.79732 -122.26532 \n", + " 37.79732 -122.26532 \n", + " 37.79732 -122.26532 \n", + " 37.79732 -122.26532 \n", "\n", - " bike_number zip_code ... c_subscription_type start_station_latitude \\\n", - "0 214 95121 ... Subscriber \n", - "1 342 94122 ... Subscriber \n", - "2 836 ... 37.7883 \n", - "3 212 94103 ... Subscriber \n", - "4 478 94107 ... Subscriber \n", - "5 443 94105 ... Subscriber \n", - "6 342 94133 ... Subscriber \n", - "7 390 4038 ... Customer \n", - "8 510 94107 ... Subscriber \n", - "9 700 94111 ... Subscriber \n", - "10 419 94158 ... Subscriber \n", - "11 3401 ... 37.789625 \n", - "12 532 94105 ... Subscriber \n", - "13 2530 ... 37.764285 \n", - "14 841 ... 37.800214 \n", - "15 390 95050 ... Subscriber \n", - "16 3120 ... 37.79413 \n", - "17 1646 ... 37.786375 \n", - "18 2907 ... 37.761634 \n", - "19 2222 ... 37.773492 \n", - "20 415 ... 37.800516 \n", - "21 3003 ... 37.758862 \n", - "22 421 95014 ... Subscriber \n", - "23 2454 ... 37.776598 \n", - "24 3036 ... 
37.786375 \n", + " end_station_latitude end_station_longitude member_birth_year \\\n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1969 \n", + " 37.792714 -122.24878 1987 \n", + " 37.792714 -122.24878 \n", + " 37.792714 -122.24878 \n", + " 37.792714 -122.24878 1988 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1987 \n", + " 37.792714 -122.24878 1969 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1989 \n", + " 37.792714 -122.24878 \n", + " 37.792714 -122.24878 1986 \n", + " 37.792714 -122.24878 1992 \n", + " 37.792714 -122.24878 1984 \n", + " 37.792714 -122.24878 1981 \n", "\n", - " start_station_longitude end_station_latitude end_station_longitude \\\n", - "0 \n", - "1 \n", - "2 -122.408531 37.7883 -122.408531 \n", - "3 \n", - "4 \n", - "5 \n", - "6 \n", - "7 \n", - "8 \n", - "9 \n", - "10 \n", - "11 -122.400811 37.781074 -122.411738 \n", - "12 \n", - "13 -122.431804 37.75921 -122.421339 \n", - "14 -122.25381 37.808848 -122.24968 \n", - "15 \n", - "16 -122.39443 37.80477 -122.403234 \n", - "17 -122.404904 37.789625 -122.400811 \n", - "18 -122.390648 37.799953 -122.398525 \n", - "19 -122.403672 37.776598 -122.395282 \n", - "20 -122.27208 37.816073 -122.267886 \n", - "21 -122.412544 37.755213 -122.420975 \n", - "22 \n", - "23 -122.395282 37.799953 -122.398525 \n", - "24 -122.404904 37.780955 -122.399749 \n", + " member_gender bike_share_for_all_trip start_station_geom \\\n", + " Male Yes POINT (-122.24878 37.79271) \n", + " Male POINT (-122.24878 37.79271) \n", + " Male Yes POINT (-122.263 37.79767) \n", + " Male Yes POINT (-122.263 37.79767) \n", + " POINT (-122.263 37.79767) \n", + " Male Yes POINT (-122.263 
37.79767) \n", + " Male Yes POINT (-122.263 37.79767) \n", + " Male POINT (-122.263 37.79767) \n", + " Male POINT (-122.263 37.79767) \n", + " POINT (-122.25381 37.80021) \n", + " POINT (-122.25381 37.80021) \n", + " Male POINT (-122.25381 37.80021) \n", + " Male Yes POINT (-122.27174 37.80456) \n", + " Male POINT (-122.27174 37.80456) \n", + " Male No POINT (-122.27174 37.80456) \n", + " Male No POINT (-122.27174 37.80456) \n", + " Male POINT (-122.27174 37.80456) \n", + " Male Yes POINT (-122.27174 37.80456) \n", + " Male Yes POINT (-122.27174 37.80456) \n", + " Male No POINT (-122.26519 37.81138) \n", + " POINT (-122.26532 37.79732) \n", + " Male POINT (-122.26532 37.79732) \n", + " Male POINT (-122.26532 37.79732) \n", + " Male POINT (-122.26532 37.79732) \n", + " Male POINT (-122.26532 37.79732) \n", "\n", - " member_birth_year member_gender bike_share_for_all_trip \\\n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 \n", - "5 \n", - "6 \n", - "7 \n", - "8 \n", - "9 \n", - "10 \n", - "11 1979 Male No \n", - "12 \n", - "13 1981 Male \n", - "14 1977 Female \n", - "15 \n", - "16 \n", - "17 1988 Male \n", - "18 1989 Female No \n", - "19 1975 Male \n", - "20 1973 Male Yes \n", - "21 1958 Male \n", - "22 \n", - "23 1979 Male \n", - "24 1989 Male \n", - "\n", - " start_station_geom end_station_geom \n", - "0 None None \n", - "1 None None \n", - "2 POINT (-122.40853 37.7883) POINT (-122.40853 37.7883) \n", - "3 None None \n", - "4 None None \n", - "5 None None \n", - "6 None None \n", - "7 None None \n", - "8 None None \n", - "9 None None \n", - "10 None None \n", - "11 POINT (-122.40081 37.78963) POINT (-122.41174 37.78107) \n", - "12 None None \n", - "13 POINT (-122.4318 37.76428) POINT (-122.42134 37.75921) \n", - "14 POINT (-122.25381 37.80021) POINT (-122.24968 37.80885) \n", - "15 None None \n", - "16 POINT (-122.39443 37.79413) POINT (-122.40323 37.80477) \n", - "17 POINT (-122.4049 37.78638) POINT (-122.40081 37.78963) \n", - "18 POINT (-122.39065 37.76163) 
POINT (-122.39852 37.79995) \n", - "19 POINT (-122.40367 37.77349) POINT (-122.39528 37.7766) \n", - "20 POINT (-122.27208 37.80052) POINT (-122.26789 37.81607) \n", - "21 POINT (-122.41254 37.75886) POINT (-122.42098 37.75521) \n", - "22 None None \n", - "23 POINT (-122.39528 37.7766) POINT (-122.39852 37.79995) \n", - "24 POINT (-122.4049 37.78638) POINT (-122.39975 37.78095) \n", + " end_station_geom \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", + "POINT (-122.24878 37.79271) \n", "...\n", "\n", "[1947417 rows x 21 columns]" @@ -1167,32 +1144,32 @@ "[2842 rows x 2 columns in total]" ], "text/plain": [ - " trip_hour num_trips\n", - "0 2018-01-01 00:00:00+00:00 20\n", - "1 2018-01-01 01:00:00+00:00 25\n", - "2 2018-01-01 02:00:00+00:00 13\n", - "3 2018-01-01 03:00:00+00:00 11\n", - "4 2018-01-01 05:00:00+00:00 4\n", - "5 2018-01-01 06:00:00+00:00 8\n", - "6 2018-01-01 07:00:00+00:00 8\n", - "7 2018-01-01 08:00:00+00:00 20\n", - "8 2018-01-01 09:00:00+00:00 30\n", - "9 2018-01-01 10:00:00+00:00 41\n", - "10 2018-01-01 11:00:00+00:00 45\n", - "11 2018-01-01 12:00:00+00:00 54\n", - "12 2018-01-01 13:00:00+00:00 57\n", - "13 2018-01-01 14:00:00+00:00 68\n", - 
"14 2018-01-01 15:00:00+00:00 86\n", - "15 2018-01-01 16:00:00+00:00 72\n", - "16 2018-01-01 17:00:00+00:00 72\n", - "17 2018-01-01 18:00:00+00:00 47\n", - "18 2018-01-01 19:00:00+00:00 32\n", - "19 2018-01-01 20:00:00+00:00 34\n", - "20 2018-01-01 21:00:00+00:00 27\n", - "21 2018-01-01 22:00:00+00:00 15\n", - "22 2018-01-01 23:00:00+00:00 6\n", - "23 2018-01-02 00:00:00+00:00 2\n", - "24 2018-01-02 01:00:00+00:00 1\n", + " trip_hour num_trips\n", + "2018-01-01 00:00:00+00:00 20\n", + "2018-01-01 01:00:00+00:00 25\n", + "2018-01-01 02:00:00+00:00 13\n", + "2018-01-01 03:00:00+00:00 11\n", + "2018-01-01 05:00:00+00:00 4\n", + "2018-01-01 06:00:00+00:00 8\n", + "2018-01-01 07:00:00+00:00 8\n", + "2018-01-01 08:00:00+00:00 20\n", + "2018-01-01 09:00:00+00:00 30\n", + "2018-01-01 10:00:00+00:00 41\n", + "2018-01-01 11:00:00+00:00 45\n", + "2018-01-01 12:00:00+00:00 54\n", + "2018-01-01 13:00:00+00:00 57\n", + "2018-01-01 14:00:00+00:00 68\n", + "2018-01-01 15:00:00+00:00 86\n", + "2018-01-01 16:00:00+00:00 72\n", + "2018-01-01 17:00:00+00:00 72\n", + "2018-01-01 18:00:00+00:00 47\n", + "2018-01-01 19:00:00+00:00 32\n", + "2018-01-01 20:00:00+00:00 34\n", + "2018-01-01 21:00:00+00:00 27\n", + "2018-01-01 22:00:00+00:00 15\n", + "2018-01-01 23:00:00+00:00 6\n", + "2018-01-02 00:00:00+00:00 2\n", + "2018-01-02 01:00:00+00:00 1\n", "...\n", "\n", "[2842 rows x 2 columns]" @@ -1253,227 +1230,227 @@ " \n", " \n", " 0\n", - " 2018-05-05 01:00:00+00:00\n", - " 50.123672\n", + " 2018-04-26 11:00:00+00:00\n", + " 204.291275\n", " 0.95\n", - " -13.062586\n", - " 113.309931\n", + " 149.151441\n", + " 259.431109\n", " \n", " \n", " \n", " 1\n", - " 2018-05-05 07:00:00+00:00\n", - " 103.112846\n", + " 2018-04-27 13:00:00+00:00\n", + " 196.034332\n", " 0.95\n", - " 33.725954\n", - " 172.499739\n", + " 203.125978\n", + " 188.942686\n", " \n", " \n", " \n", " 2\n", - " 2018-05-03 15:00:00+00:00\n", - " 230.49147\n", + " 2018-04-27 20:00:00+00:00\n", + " 133.339386\n", " 0.95\n", - " 
152.635986\n", - " 308.346954\n", + " 132.658946\n", + " 134.019826\n", " \n", " \n", " \n", " 3\n", - " 2018-05-02 08:00:00+00:00\n", - " 737.477356\n", + " 2018-04-28 05:00:00+00:00\n", + " -27.321686\n", " 0.95\n", - " 562.979208\n", - " 911.975504\n", + " -13.918083\n", + " -40.725288\n", " \n", " \n", " \n", " 4\n", - " 2018-05-01 08:00:00+00:00\n", - " 679.980469\n", + " 2018-04-29 12:00:00+00:00\n", + " 117.657822\n", " 0.95\n", - " 479.980134\n", - " 879.980803\n", + " 58.020439\n", + " 177.295205\n", " \n", " \n", " \n", " 5\n", - " 2018-05-06 18:00:00+00:00\n", - " 136.80835\n", + " 2018-04-24 10:00:00+00:00\n", + " 221.464111\n", " 0.95\n", - " -13.813863\n", - " 287.430562\n", + " 154.598621\n", + " 288.329602\n", " \n", " \n", " \n", " 6\n", - " 2018-05-01 11:00:00+00:00\n", - " 120.364288\n", + " 2018-04-24 23:00:00+00:00\n", + " 56.203827\n", " 0.95\n", - " 52.778249\n", - " 187.950328\n", + " 42.096868\n", + " 70.310786\n", " \n", " \n", " \n", " 7\n", - " 2018-05-06 22:00:00+00:00\n", - " 64.722443\n", + " 2018-04-29 07:00:00+00:00\n", + " -14.801514\n", " 0.95\n", - " -55.555842\n", - " 185.000727\n", + " -48.905982\n", + " 19.302954\n", " \n", " \n", " \n", " 8\n", - " 2018-05-03 02:00:00+00:00\n", - " 42.689804\n", + " 2018-04-24 22:00:00+00:00\n", + " 58.174316\n", " 0.95\n", - " 33.258414\n", - " 52.121194\n", + " 85.290985\n", + " 31.057648\n", " \n", " \n", " \n", " 9\n", - " 2018-05-07 17:00:00+00:00\n", - " 594.999084\n", + " 2018-04-25 08:00:00+00:00\n", + " 666.577393\n", " 0.95\n", - " 346.917217\n", - " 843.080952\n", + " 518.655663\n", + " 814.499122\n", " \n", " \n", " \n", " 10\n", - " 2018-05-03 20:00:00+00:00\n", - " 161.822281\n", + " 2018-04-29 01:00:00+00:00\n", + " 40.19632\n", " 0.95\n", - " 100.005942\n", - " 223.63862\n", + " 48.957491\n", + " 31.435148\n", " \n", " \n", " \n", " 11\n", - " 2018-05-01 20:00:00+00:00\n", - " 173.801025\n", + " 2018-04-29 02:00:00+00:00\n", + " 29.00975\n", " 0.95\n", - " 56.460376\n", - " 
291.141675\n", + " -8.137303\n", + " 66.156804\n", " \n", " \n", " \n", " 12\n", - " 2018-05-04 17:00:00+00:00\n", - " 485.449829\n", + " 2018-04-30 18:00:00+00:00\n", + " 488.885284\n", " 0.95\n", - " 356.038539\n", - " 614.86112\n", + " 315.531321\n", + " 662.239248\n", " \n", " \n", " \n", " 13\n", - " 2018-05-04 09:00:00+00:00\n", - " 418.055878\n", + " 2018-04-27 10:00:00+00:00\n", + " 188.79628\n", " 0.95\n", - " 281.134736\n", - " 554.977019\n", + " 157.126395\n", + " 220.466165\n", " \n", " \n", " \n", " 14\n", - " 2018-05-07 03:00:00+00:00\n", - " 24.735134\n", + " 2018-04-24 21:00:00+00:00\n", + " 107.512665\n", " 0.95\n", - " -100.607727\n", - " 150.077995\n", + " 108.890078\n", + " 106.135251\n", " \n", " \n", " \n", " 15\n", - " 2018-05-05 11:00:00+00:00\n", - " 186.08136\n", + " 2018-04-28 14:00:00+00:00\n", + " 149.738419\n", " 0.95\n", - " 140.706789\n", - " 231.455931\n", + " 161.696173\n", + " 137.780664\n", " \n", " \n", " \n", " 16\n", - " 2018-05-03 08:00:00+00:00\n", - " 675.380249\n", + " 2018-04-28 20:00:00+00:00\n", + " 71.378677\n", " 0.95\n", - " 532.913707\n", - " 817.846791\n", + " 98.940288\n", + " 43.817067\n", " \n", " \n", " \n", " 17\n", - " 2018-05-02 09:00:00+00:00\n", - " 537.494812\n", + " 2018-04-30 13:00:00+00:00\n", + " 139.673706\n", " 0.95\n", - " 376.406922\n", - " 698.582702\n", + " 66.493742\n", + " 212.85367\n", " \n", " \n", " \n", " 18\n", - " 2018-05-01 12:00:00+00:00\n", - " 101.637169\n", + " 2018-04-24 12:00:00+00:00\n", + " 144.577728\n", " 0.95\n", - " 55.141509\n", - " 148.132829\n", + " 120.01921\n", + " 169.136247\n", " \n", " \n", " \n", " 19\n", - " 2018-05-05 00:00:00+00:00\n", - " 7.469772\n", + " 2018-04-25 00:00:00+00:00\n", + " 54.215515\n", " 0.95\n", - " -23.930392\n", - " 38.869936\n", + " 46.8394\n", + " 61.591631\n", " \n", " \n", " \n", " 20\n", - " 2018-05-02 14:00:00+00:00\n", - " 153.851379\n", + " 2018-04-26 05:00:00+00:00\n", + " 8.140533\n", " 0.95\n", - " 104.224826\n", - " 203.477932\n", 
+ " -14.613272\n", + " 30.894339\n", " \n", " \n", " \n", " 21\n", - " 2018-05-04 13:00:00+00:00\n", - " 162.676117\n", + " 2018-04-26 14:00:00+00:00\n", + " 198.744949\n", " 0.95\n", - " 113.098327\n", - " 212.253907\n", + " 174.982268\n", + " 222.50763\n", " \n", " \n", " \n", " 22\n", - " 2018-05-04 16:00:00+00:00\n", - " 330.643402\n", + " 2018-04-27 02:00:00+00:00\n", + " 9.91806\n", " 0.95\n", - " 205.125168\n", - " 456.161636\n", + " -26.749948\n", + " 46.586069\n", " \n", " \n", " \n", " 23\n", - " 2018-05-04 21:00:00+00:00\n", - " 136.264679\n", + " 2018-04-29 03:00:00+00:00\n", + " 32.063339\n", " 0.95\n", - " 41.947438\n", - " 230.58192\n", + " -35.730978\n", + " 99.857656\n", " \n", " \n", " \n", " 24\n", - " 2018-05-02 17:00:00+00:00\n", - " 675.527222\n", + " 2018-04-27 04:00:00+00:00\n", + " 25.757111\n", " 0.95\n", - " 516.358698\n", - " 834.695746\n", + " 8.178037\n", + " 43.336184\n", " \n", " \n", " \n", @@ -1482,86 +1459,86 @@ "[168 rows x 6 columns in total]" ], "text/plain": [ - " forecast_timestamp forecast_value confidence_level \\\n", - "0 2018-05-05 01:00:00+00:00 50.123672 0.95 \n", - "1 2018-05-05 07:00:00+00:00 103.112846 0.95 \n", - "2 2018-05-03 15:00:00+00:00 230.49147 0.95 \n", - "3 2018-05-02 08:00:00+00:00 737.477356 0.95 \n", - "4 2018-05-01 08:00:00+00:00 679.980469 0.95 \n", - "5 2018-05-06 18:00:00+00:00 136.80835 0.95 \n", - "6 2018-05-01 11:00:00+00:00 120.364288 0.95 \n", - "7 2018-05-06 22:00:00+00:00 64.722443 0.95 \n", - "8 2018-05-03 02:00:00+00:00 42.689804 0.95 \n", - "9 2018-05-07 17:00:00+00:00 594.999084 0.95 \n", - "10 2018-05-03 20:00:00+00:00 161.822281 0.95 \n", - "11 2018-05-01 20:00:00+00:00 173.801025 0.95 \n", - "12 2018-05-04 17:00:00+00:00 485.449829 0.95 \n", - "13 2018-05-04 09:00:00+00:00 418.055878 0.95 \n", - "14 2018-05-07 03:00:00+00:00 24.735134 0.95 \n", - "15 2018-05-05 11:00:00+00:00 186.08136 0.95 \n", - "16 2018-05-03 08:00:00+00:00 675.380249 0.95 \n", - "17 2018-05-02 09:00:00+00:00 
537.494812 0.95 \n", - "18 2018-05-01 12:00:00+00:00 101.637169 0.95 \n", - "19 2018-05-05 00:00:00+00:00 7.469772 0.95 \n", - "20 2018-05-02 14:00:00+00:00 153.851379 0.95 \n", - "21 2018-05-04 13:00:00+00:00 162.676117 0.95 \n", - "22 2018-05-04 16:00:00+00:00 330.643402 0.95 \n", - "23 2018-05-04 21:00:00+00:00 136.264679 0.95 \n", - "24 2018-05-02 17:00:00+00:00 675.527222 0.95 \n", + " forecast_timestamp forecast_value confidence_level \\\n", + "2018-04-26 11:00:00+00:00 204.291275 0.95 \n", + "2018-04-27 13:00:00+00:00 196.034332 0.95 \n", + "2018-04-27 20:00:00+00:00 133.339386 0.95 \n", + "2018-04-28 05:00:00+00:00 -27.321686 0.95 \n", + "2018-04-29 12:00:00+00:00 117.657822 0.95 \n", + "2018-04-24 10:00:00+00:00 221.464111 0.95 \n", + "2018-04-24 23:00:00+00:00 56.203827 0.95 \n", + "2018-04-29 07:00:00+00:00 -14.801514 0.95 \n", + "2018-04-24 22:00:00+00:00 58.174316 0.95 \n", + "2018-04-25 08:00:00+00:00 666.577393 0.95 \n", + "2018-04-29 01:00:00+00:00 40.19632 0.95 \n", + "2018-04-29 02:00:00+00:00 29.00975 0.95 \n", + "2018-04-30 18:00:00+00:00 488.885284 0.95 \n", + "2018-04-27 10:00:00+00:00 188.79628 0.95 \n", + "2018-04-24 21:00:00+00:00 107.512665 0.95 \n", + "2018-04-28 14:00:00+00:00 149.738419 0.95 \n", + "2018-04-28 20:00:00+00:00 71.378677 0.95 \n", + "2018-04-30 13:00:00+00:00 139.673706 0.95 \n", + "2018-04-24 12:00:00+00:00 144.577728 0.95 \n", + "2018-04-25 00:00:00+00:00 54.215515 0.95 \n", + "2018-04-26 05:00:00+00:00 8.140533 0.95 \n", + "2018-04-26 14:00:00+00:00 198.744949 0.95 \n", + "2018-04-27 02:00:00+00:00 9.91806 0.95 \n", + "2018-04-29 03:00:00+00:00 32.063339 0.95 \n", + "2018-04-27 04:00:00+00:00 25.757111 0.95 \n", "\n", - " prediction_interval_lower_bound prediction_interval_upper_bound \\\n", - "0 -13.062586 113.309931 \n", - "1 33.725954 172.499739 \n", - "2 152.635986 308.346954 \n", - "3 562.979208 911.975504 \n", - "4 479.980134 879.980803 \n", - "5 -13.813863 287.430562 \n", - "6 52.778249 187.950328 \n", - "7 
-55.555842 185.000727 \n", - "8 33.258414 52.121194 \n", - "9 346.917217 843.080952 \n", - "10 100.005942 223.63862 \n", - "11 56.460376 291.141675 \n", - "12 356.038539 614.86112 \n", - "13 281.134736 554.977019 \n", - "14 -100.607727 150.077995 \n", - "15 140.706789 231.455931 \n", - "16 532.913707 817.846791 \n", - "17 376.406922 698.582702 \n", - "18 55.141509 148.132829 \n", - "19 -23.930392 38.869936 \n", - "20 104.224826 203.477932 \n", - "21 113.098327 212.253907 \n", - "22 205.125168 456.161636 \n", - "23 41.947438 230.58192 \n", - "24 516.358698 834.695746 \n", + " prediction_interval_lower_bound prediction_interval_upper_bound \\\n", + " 149.151441 259.431109 \n", + " 203.125978 188.942686 \n", + " 132.658946 134.019826 \n", + " -13.918083 -40.725288 \n", + " 58.020439 177.295205 \n", + " 154.598621 288.329602 \n", + " 42.096868 70.310786 \n", + " -48.905982 19.302954 \n", + " 85.290985 31.057648 \n", + " 518.655663 814.499122 \n", + " 48.957491 31.435148 \n", + " -8.137303 66.156804 \n", + " 315.531321 662.239248 \n", + " 157.126395 220.466165 \n", + " 108.890078 106.135251 \n", + " 161.696173 137.780664 \n", + " 98.940288 43.817067 \n", + " 66.493742 212.85367 \n", + " 120.01921 169.136247 \n", + " 46.8394 61.591631 \n", + " -14.613272 30.894339 \n", + " 174.982268 222.50763 \n", + " -26.749948 46.586069 \n", + " -35.730978 99.857656 \n", + " 8.178037 43.336184 \n", "\n", - " ai_forecast_status \n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 \n", - "5 \n", - "6 \n", - "7 \n", - "8 \n", - "9 \n", - "10 \n", - "11 \n", - "12 \n", - "13 \n", - "14 \n", - "15 \n", - "16 \n", - "17 \n", - "18 \n", - "19 \n", - "20 \n", - "21 \n", - "22 \n", - "23 \n", - "24 \n", + "ai_forecast_status \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", "...\n", "\n", "[168 rows x 6 columns]" @@ 
-1573,7 +1550,8 @@ } ], "source": [ - "result = df_grouped.ai.forecast(timestamp_column=\"trip_hour\", data_column=\"num_trips\", horizon=168) # 1 week\n", + "# Use all the data except the last week (2842-168 rows) for training, then predict the last week (168 rows).\n", + "result = df_grouped.head(2842-168).ai.forecast(timestamp_column=\"trip_hour\", data_column=\"num_trips\", horizon=168) \n", "result" ] }, @@ -1597,6 +1575,13 @@ "df_all = df_all.tail(672) # 4 weeks" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot a line chart and compare with the actual result." + ] + }, { "cell_type": "code", "execution_count": 8, @@ -1614,7 +1599,7 @@ }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] From be9a89f18319fa1cb1be676fc2033961dc800d16 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 24 Jun 2025 11:32:06 -0700 Subject: [PATCH 04/28] chore: add more types for read_gbq_table in conftest (#1843) * chore: add more types for read_gbq_table in conftest * add bug id * undo dtype conversions for repeated_pandas_df * undo unrelated changes * fix json --- bigframes/core/compile/sqlglot/sqlglot_ir.py | 2 +- tests/unit/core/compile/sqlglot/conftest.py | 120 +++++++++++++++++- .../test_compile_concat/out.sql | 24 ++-- .../test_compile_readlocal_w_json_df/out.sql | 7 +- .../out.sql | 10 ++ .../out.sql | 11 ++ .../out.sql | 23 ++++ .../compile/sqlglot/test_compile_readlocal.py | 19 ++- .../compile/sqlglot/test_compile_readtable.py | 14 ++ .../google_cloud_bigquery/_pandas_helpers.py | 2 + 10 files changed, 202 insertions(+), 30 deletions(-) create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_json_types/out.sql create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_nested_structs_types/out.sql create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_repeated_types/out.sql diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 3766ab3266..6b805802b0 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -182,7 +182,7 @@ def from_union( selections = [ sge.Alias( - this=expr.alias_or_name, + this=sge.to_identifier(expr.alias_or_name, quoted=cls.quoted), alias=sge.to_identifier(output_id, quoted=cls.quoted), ) for expr, output_id in zip(select_expr.expressions, output_ids) diff --git a/tests/unit/core/compile/sqlglot/conftest.py b/tests/unit/core/compile/sqlglot/conftest.py index 20f8159f01..6d5fac8184 100644 --- a/tests/unit/core/compile/sqlglot/conftest.py +++ 
b/tests/unit/core/compile/sqlglot/conftest.py @@ -29,24 +29,48 @@ DATA_DIR = CURRENT_DIR.parent.parent.parent.parent / "data" -@pytest.fixture(scope="session") -def compiler_session(scalars_types_table_schema): +def _create_compiler_session(table_name, table_schema): + """Helper function to create a compiler session.""" from bigframes.testing import compiler_session - # TODO: Check if ordering mode is needed for the tests. - table_name = "scalar_types" anonymous_dataset = bigquery.DatasetReference.from_string( "bigframes-dev.sqlglot_test" ) session = mocks.create_bigquery_session( table_name=table_name, - table_schema=scalars_types_table_schema, + table_schema=table_schema, anonymous_dataset=anonymous_dataset, ) session._executor = compiler_session.SQLCompilerExecutor() return session +@pytest.fixture(scope="session") +def compiler_session(scalars_types_table_schema): + """Compiler session for scalar types.""" + return _create_compiler_session("scalar_types", scalars_types_table_schema) + + +@pytest.fixture(scope="session") +def compiler_session_w_repeated_types(repeated_types_table_schema): + """Compiler session for repeated data types.""" + return _create_compiler_session("repeated_types", repeated_types_table_schema) + + +@pytest.fixture(scope="session") +def compiler_session_w_nested_structs_types(nested_structs_types_table_schema): + """Compiler session for nested STRUCT data types.""" + return _create_compiler_session( + "nested_structs_types", nested_structs_types_table_schema + ) + + +@pytest.fixture(scope="session") +def compiler_session_w_json_types(json_types_table_schema): + """Compiler session for JSON data types.""" + return _create_compiler_session("json_types", json_types_table_schema) + + @pytest.fixture(scope="session") def scalars_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: return [ @@ -91,6 +115,40 @@ def scalars_types_pandas_df() -> pd.DataFrame: return df +@pytest.fixture(scope="session") +def 
nested_structs_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("id", "INTEGER"), + bigquery.SchemaField( + "people", + "RECORD", + fields=[ + bigquery.SchemaField("name", "STRING"), + bigquery.SchemaField("age", "INTEGER"), + bigquery.SchemaField( + "address", + "RECORD", + fields=[ + bigquery.SchemaField("city", "STRING"), + bigquery.SchemaField("country", "STRING"), + ], + ), + ], + ), + ] + + +@pytest.fixture(scope="session") +def nested_structs_types_df(compiler_session_w_nested_structs_types) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing nested STRUCT types and using the `id` + column as the index.""" + bf_df = compiler_session_w_nested_structs_types.read_gbq_table( + "bigframes-dev.sqlglot_test.nested_structs_types" + ) + bf_df = bf_df.set_index("id", drop=False) + return bf_df + + @pytest.fixture(scope="session") def nested_structs_pandas_df() -> pd.DataFrame: """Returns a pandas DataFrame containing STRUCT types and using the `id` @@ -117,7 +175,32 @@ def nested_structs_pandas_df() -> pd.DataFrame: @pytest.fixture(scope="session") -def repeated_pandas_df() -> pd.DataFrame: +def repeated_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("int_list_col", "INTEGER", "REPEATED"), + bigquery.SchemaField("bool_list_col", "BOOLEAN", "REPEATED"), + bigquery.SchemaField("float_list_col", "FLOAT", "REPEATED"), + bigquery.SchemaField("date_list_col", "DATE", "REPEATED"), + bigquery.SchemaField("date_time_list_col", "DATETIME", "REPEATED"), + bigquery.SchemaField("numeric_list_col", "NUMERIC", "REPEATED"), + bigquery.SchemaField("string_list_col", "STRING", "REPEATED"), + ] + + +@pytest.fixture(scope="session") +def repeated_types_df(compiler_session_w_repeated_types) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing repeated types and using the `rowindex` + column as the index.""" + bf_df =
compiler_session_w_repeated_types.read_gbq_table( + "bigframes-dev.sqlglot_test.repeated_types" + ) + bf_df = bf_df.set_index("rowindex", drop=False) + return bf_df + + +@pytest.fixture(scope="session") +def repeated_types_pandas_df() -> pd.DataFrame: """Returns a pandas DataFrame containing LIST types and using the `rowindex` column as the index.""" @@ -125,10 +208,31 @@ def repeated_pandas_df() -> pd.DataFrame: DATA_DIR / "repeated.jsonl", lines=True, ) + # TODO: add dtype conversion here if needed. df = df.set_index("rowindex") return df +@pytest.fixture(scope="session") +def json_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: + return [ + bigquery.SchemaField("rowindex", "INTEGER"), + bigquery.SchemaField("json_col", "JSON"), + ] + + +@pytest.fixture(scope="session") +def json_types_df(compiler_session_w_json_types) -> bpd.DataFrame: + """Returns a BigFrames DataFrame containing JSON types and using the `rowindex` + column as the index.""" + bf_df = compiler_session_w_json_types.read_gbq_table( + "bigframes-dev.sqlglot_test.json_types" + ) + # TODO(b/427305807): Why `drop=False` will produce two "rowindex" columns? + bf_df = bf_df.set_index("rowindex", drop=True) + return bf_df + + @pytest.fixture(scope="session") def json_pandas_df() -> pd.DataFrame: """Returns a pandas DataFrame containing JSON types and using the `rowindex` @@ -149,8 +253,10 @@ def json_pandas_df() -> pd.DataFrame: ] df = pd.DataFrame( { + "rowindex": pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE), "json_col": pd.Series(json_data, dtype=dtypes.JSON_DTYPE), }, - index=pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE), ) + # TODO(b/427305807): Why `drop=False` will produce two "rowindex" columns? 
+ df = df.set_index("rowindex", drop=True) return df diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql index 8da545b8fa..62e22a6a19 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql @@ -49,21 +49,21 @@ WITH `bfcte_1` AS ( * FROM ( SELECT - bfcol_17 AS `bfcol_46`, - bfcol_18 AS `bfcol_47`, - bfcol_19 AS `bfcol_48`, - bfcol_20 AS `bfcol_49`, - bfcol_21 AS `bfcol_50`, - bfcol_22 AS `bfcol_51` + `bfcol_17` AS `bfcol_46`, + `bfcol_18` AS `bfcol_47`, + `bfcol_19` AS `bfcol_48`, + `bfcol_20` AS `bfcol_49`, + `bfcol_21` AS `bfcol_50`, + `bfcol_22` AS `bfcol_51` FROM `bfcte_6` UNION ALL SELECT - bfcol_40 AS `bfcol_46`, - bfcol_41 AS `bfcol_47`, - bfcol_42 AS `bfcol_48`, - bfcol_43 AS `bfcol_49`, - bfcol_44 AS `bfcol_50`, - bfcol_45 AS `bfcol_51` + `bfcol_40` AS `bfcol_46`, + `bfcol_41` AS `bfcol_47`, + `bfcol_42` AS `bfcol_48`, + `bfcol_43` AS `bfcol_49`, + `bfcol_44` AS `bfcol_50`, + `bfcol_45` AS `bfcol_51` FROM `bfcte_7` ) ) diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql index 100036d75f..4e21266b87 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql @@ -1,10 +1,11 @@ WITH `bfcte_0` AS ( SELECT * - FROM UNNEST(ARRAY>[STRUCT(PARSE_JSON('null'), 0), STRUCT(PARSE_JSON('true'), 1), STRUCT(PARSE_JSON('100'), 2), STRUCT(PARSE_JSON('0.98'), 3), STRUCT(PARSE_JSON('"a string"'), 4), STRUCT(PARSE_JSON('[]'), 5), 
STRUCT(PARSE_JSON('[1,2,3]'), 6), STRUCT(PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(PARSE_JSON('"100"'), 8), STRUCT(PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(PARSE_JSON('{"list_data":[10,20,30]}'), 11)]) + FROM UNNEST(ARRAY>[STRUCT(0, PARSE_JSON('null'), 0), STRUCT(1, PARSE_JSON('true'), 1), STRUCT(2, PARSE_JSON('100'), 2), STRUCT(3, PARSE_JSON('0.98'), 3), STRUCT(4, PARSE_JSON('"a string"'), 4), STRUCT(5, PARSE_JSON('[]'), 5), STRUCT(6, PARSE_JSON('[1,2,3]'), 6), STRUCT(7, PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(8, PARSE_JSON('"100"'), 8), STRUCT(9, PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(10, PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(11, PARSE_JSON('{"list_data":[10,20,30]}'), 11)]) ) SELECT - `bfcol_0` AS `json_col` + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `json_col` FROM `bfcte_0` ORDER BY - `bfcol_1` ASC NULLS LAST \ No newline at end of file + `bfcol_2` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_json_types/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_json_types/out.sql new file mode 100644 index 0000000000..4e8f61d75d --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_json_types/out.sql @@ -0,0 +1,10 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `json_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`json_types` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `json_col` +FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_nested_structs_types/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_nested_structs_types/out.sql new file mode 100644 index 
0000000000..75c4a86e18 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_nested_structs_types/out.sql @@ -0,0 +1,11 @@ +WITH `bfcte_0` AS ( + SELECT + `id` AS `bfcol_0`, + `people` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`nested_structs_types` +) +SELECT + `bfcol_0` AS `id`, + `bfcol_0` AS `id_1`, + `bfcol_1` AS `people` +FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_repeated_types/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_repeated_types/out.sql new file mode 100644 index 0000000000..2436c01a44 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_repeated_types/out.sql @@ -0,0 +1,23 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `int_list_col` AS `bfcol_1`, + `bool_list_col` AS `bfcol_2`, + `float_list_col` AS `bfcol_3`, + `date_list_col` AS `bfcol_4`, + `date_time_list_col` AS `bfcol_5`, + `numeric_list_col` AS `bfcol_6`, + `string_list_col` AS `bfcol_7` + FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_0` AS `rowindex_1`, + `bfcol_1` AS `int_list_col`, + `bfcol_2` AS `bool_list_col`, + `bfcol_3` AS `float_list_col`, + `bfcol_4` AS `date_list_col`, + `bfcol_5` AS `date_time_list_col`, + `bfcol_6` AS `numeric_list_col`, + `bfcol_7` AS `string_list_col` +FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py index 58587da129..bd27ad450e 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py +++ b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py @@ -30,26 +30,31 @@ def test_compile_readlocal( def test_compile_readlocal_w_structs_df( nested_structs_pandas_df: pd.DataFrame, - 
compiler_session: bigframes.Session, + compiler_session_w_nested_structs_types: bigframes.Session, snapshot, ): - bf_df = bpd.DataFrame(nested_structs_pandas_df, session=compiler_session) + # TODO(b/427306734): Check why the output is different from the expected output. + bf_df = bpd.DataFrame( + nested_structs_pandas_df, session=compiler_session_w_nested_structs_types + ) snapshot.assert_match(bf_df.sql, "out.sql") def test_compile_readlocal_w_lists_df( - repeated_pandas_df: pd.DataFrame, - compiler_session: bigframes.Session, + repeated_types_pandas_df: pd.DataFrame, + compiler_session_w_repeated_types: bigframes.Session, snapshot, ): - bf_df = bpd.DataFrame(repeated_pandas_df, session=compiler_session) + bf_df = bpd.DataFrame( + repeated_types_pandas_df, session=compiler_session_w_repeated_types + ) snapshot.assert_match(bf_df.sql, "out.sql") def test_compile_readlocal_w_json_df( json_pandas_df: pd.DataFrame, - compiler_session: bigframes.Session, + compiler_session_w_json_types: bigframes.Session, snapshot, ): - bf_df = bpd.DataFrame(json_pandas_df, session=compiler_session) + bf_df = bpd.DataFrame(json_pandas_df, session=compiler_session_w_json_types) snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/test_compile_readtable.py b/tests/unit/core/compile/sqlglot/test_compile_readtable.py index 63849f093c..d3b5140471 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_readtable.py +++ b/tests/unit/core/compile/sqlglot/test_compile_readtable.py @@ -23,6 +23,20 @@ def test_compile_readtable(scalars_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(scalars_types_df.sql, "out.sql") +def test_compile_readtable_w_repeated_types(repeated_types_df: bpd.DataFrame, snapshot): + snapshot.assert_match(repeated_types_df.sql, "out.sql") + + +def test_compile_readtable_w_nested_structs_types( + nested_structs_types_df: bpd.DataFrame, snapshot +): + snapshot.assert_match(nested_structs_types_df.sql, "out.sql") + + +def 
test_compile_readtable_w_json_types(json_types_df: bpd.DataFrame, snapshot): + snapshot.assert_match(json_types_df.sql, "out.sql") + + def test_compile_readtable_w_ordering(scalars_types_df: bpd.DataFrame, snapshot): bf_df = scalars_types_df[["int64_col"]] bf_df = bf_df.sort_values("int64_col") diff --git a/third_party/bigframes_vendored/google_cloud_bigquery/_pandas_helpers.py b/third_party/bigframes_vendored/google_cloud_bigquery/_pandas_helpers.py index 5e2a7a7ef0..3e35b1382e 100644 --- a/third_party/bigframes_vendored/google_cloud_bigquery/_pandas_helpers.py +++ b/third_party/bigframes_vendored/google_cloud_bigquery/_pandas_helpers.py @@ -17,6 +17,7 @@ import warnings +import db_dtypes import google.cloud.bigquery.schema as schema import pyarrow @@ -61,6 +62,7 @@ def pyarrow_timestamp(): "TIME": pyarrow_time, "TIMESTAMP": pyarrow_timestamp, "BIGNUMERIC": pyarrow_bignumeric, + "JSON": db_dtypes.JSONArrowType, } ARROW_SCALAR_IDS_TO_BQ = { # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes From 15e1277b1413de18a5e36f72959a99701d6df08b Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 24 Jun 2025 12:00:17 -0700 Subject: [PATCH 05/28] docs: add data visualization samples for public doc (#1847) --- samples/snippets/data_visualization_test.py | 149 ++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 samples/snippets/data_visualization_test.py diff --git a/samples/snippets/data_visualization_test.py b/samples/snippets/data_visualization_test.py new file mode 100644 index 0000000000..64cbbe0511 --- /dev/null +++ b/samples/snippets/data_visualization_test.py @@ -0,0 +1,149 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (t +# you may not use this file except in compliance wi +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in +# distributed under the License is distributed on a +# WITHOUT 
WARRANTIES OR CONDITIONS OF ANY KIND, eit +# See the License for the specific language governi +# limitations under the License. + + +def test_data_visualization() -> None: + # [START bigquery_dataframes_data_visualization_penguin_histogram] + import bigframes.pandas as bpd + + penguins = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") + penguins["culmen_depth_mm"].plot.hist(bins=40) + # [END bigquery_dataframes_data_visualization_penguin_histogram] + + # [START bigquery_dataframes_data_visualization_noaa_line_chart] + import bigframes.pandas as bpd + + noaa_surface = bpd.read_gbq("bigquery-public-data.noaa_gsod.gsod2021") + + # Calculate median temperature for each day + noaa_surface_median_temps = noaa_surface[["date", "temp"]].groupby("date").median() + + noaa_surface_median_temps.plot.line() + # [END bigquery_dataframes_data_visualization_noaa_line_chart] + + # [START bigquery_dataframes_data_visualization_usa_names_area_chart] + import bigframes.pandas as bpd + + usa_names = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013") + + # Count the occurrences of the target names each year. The result is a dataframe with a multi-index. + name_counts = ( + usa_names[usa_names["name"].isin(("Mary", "Emily", "Lisa"))] + .groupby(("year", "name"))["number"] + .sum() + ) + + # Flatten the index of the dataframe so that the counts for each name have their own columns.
+ name_counts = name_counts.unstack(level=1).fillna(0) + + name_counts.plot.area(stacked=False, alpha=0.5) + # [END bigquery_dataframes_data_visualization_usa_names_area_chart] + + # [START bigquery_dataframes_data_visualization_penguin_bar_chart] + import bigframes.pandas as bpd + + penguins = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") + + penguin_count_by_sex = ( + penguins[penguins["sex"].isin(("MALE", "FEMALE"))] + .groupby("sex")["species"] + .count() + ) + penguin_count_by_sex.plot.bar() + # [END bigquery_dataframes_data_visualization_penguin_bar_chart] + + # [START bigquery_dataframes_data_visualization_taxi_scatter_plot] + import bigframes.pandas as bpd + + taxi_trips = bpd.read_gbq( + "bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2021" + ).dropna() + + # Data Cleaning + taxi_trips = taxi_trips[ + taxi_trips["trip_distance"].between(0, 10, inclusive="right") + ] + taxi_trips = taxi_trips[taxi_trips["fare_amount"].between(0, 50, inclusive="right")] + + # If you are using partial ordering mode, you will also need to assign an order to your dataset. + # Otherwise, the next line can be skipped. 
+ taxi_trips = taxi_trips.sort_values("pickup_datetime") + + taxi_trips.plot.scatter(x="trip_distance", y="fare_amount", alpha=0.5) + # [END bigquery_dataframes_data_visualization_taxi_scatter_plot] + + # [START bigquery_dataframes_data_visualization_noaa_sampling_n] + import bigframes.pandas as bpd + + noaa_surface = bpd.read_gbq("bigquery-public-data.noaa_gsod.gsod2021") + + # Calculate median temperature for each day + noaa_surface_median_temps = noaa_surface[["date", "temp"]].groupby("date").median() + + noaa_surface_median_temps.plot.line(sampling_n=40) + # [END bigquery_dataframes_data_visualization_noaa_sampling_n] + + # [START bigquery_dataframes_data_visualization_usa_names_subplots] + import bigframes.pandas as bpd + + usa_names = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013") + + # Count the occurrences of the target names each year. The result is a dataframe with a multi-index. + name_counts = ( + usa_names[usa_names["name"].isin(("Mary", "Emily", "Lisa"))] + .groupby(("year", "name"))["number"] + .sum() + ) + + # Flatten the index of the dataframe so that the counts for each name have their own columns. + name_counts = name_counts.unstack(level=1).fillna(0) + + name_counts.plot.area(subplots=True, alpha=0.5) + # [END bigquery_dataframes_data_visualization_usa_names_subplots] + + # [START bigquery_dataframes_data_visualization_taxi_scatter_multidimension] + import bigframes.pandas as bpd + + taxi_trips = bpd.read_gbq( + "bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2021" + ).dropna() + + # Data Cleaning + taxi_trips = taxi_trips[ + taxi_trips["trip_distance"].between(0, 10, inclusive="right") + ] + taxi_trips = taxi_trips[taxi_trips["fare_amount"].between(0, 50, inclusive="right")] + + # If you are using partial ordering mode, you also need to assign an order to your dataset. + # Otherwise, the next line can be skipped.
+ taxi_trips = taxi_trips.sort_values("pickup_datetime") + + taxi_trips["passenger_count_scaled"] = taxi_trips["passenger_count"] * 30 + + taxi_trips.plot.scatter( + x="trip_distance", + xlabel="trip distance (miles)", + y="fare_amount", + ylabel="fare amount (usd)", + alpha=0.5, + s="passenger_count_scaled", + label="passenger_count", + c="tip_amount", + cmap="jet", + colorbar=True, + legend=True, + figsize=(15, 7), + sampling_n=1000, + ) + # [END bigquery_dataframes_data_visualization_taxi_scatter_multidimension] From c06d8db9b5e8372c0c5e932181e1a09f7158ac44 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 24 Jun 2025 14:19:09 -0700 Subject: [PATCH 06/28] chore!: remove ai.top_k(). (#1842) * chore: remove ai.top_k(). * remove redundant import --- bigframes/operations/ai.py | 203 ---------------------- notebooks/experimental/ai_operators.ipynb | 123 ------------- tests/system/large/operations/test_ai.py | 59 ------- tests/system/small/operations/test_ai.py | 24 --- 4 files changed, 409 deletions(-) diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index 10c842c64c..8c7628059a 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -19,8 +19,6 @@ from typing import Dict, Iterable, List, Optional, Sequence, Union import warnings -import numpy as np - from bigframes import dtypes, exceptions, options from bigframes.core import guid, log_adapter @@ -586,207 +584,6 @@ def search( return typing.cast(bigframes.dataframe.DataFrame, search_result) - def top_k( - self, - instruction: str, - model, - k: int = 10, - ground_with_google_search: bool = False, - ): - """ - Ranks each tuple and returns the k best according to the instruction. - - This method employs a quick select algorithm to efficiently compare the pivot - with all other items. By leveraging an LLM (Large Language Model), it then - identifies the top 'k' best answers from these comparisons. 
- - **Examples:** - - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> bpd.options.experiments.ai_operators = True - >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 - - >>> import bigframes.ml.llm as llm - >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") - - >>> df = bpd.DataFrame( - ... { - ... "Animals": ["Dog", "Bird", "Cat", "Horse"], - ... "Sounds": ["Woof", "Chirp", "Meow", "Neigh"], - ... }) - >>> df.ai.top_k("{Animals} are more popular as pets", model=model, k=2) - Animals Sounds - 0 Dog Woof - 2 Cat Meow - - [2 rows x 2 columns] - - Args: - instruction (str): - An instruction on how to map the data. This value must contain - column references by name enclosed in braces. - For example, to reference a column named "Animals", use "{Animals}" in the - instruction, like: "{Animals} are more popular as pets" - - model (bigframes.ml.llm.GeminiTextGenerator): - A GeminiTextGenerator provided by the Bigframes ML package. - - k (int, default 10): - The number of rows to return. - - ground_with_google_search (bool, default False): - Enables Grounding with Google Search for the GeminiTextGenerator model. - When set to True, the model incorporates relevant information from Google - Search results into its responses, enhancing their accuracy and factualness. - Note: Using this feature may impact billing costs. Refer to the pricing - page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models - The default is `False`. - - Returns: - bigframes.dataframe.DataFrame: A new DataFrame with the top k rows. - - Raises: - NotImplementedError: when the AI operator experiment is off. - ValueError: when the instruction refers to a non-existing column, or when no - columns are referred to. 
- """ - if not options.experiments.ai_operators: - raise NotImplementedError() - - import bigframes.dataframe - import bigframes.series - - self._validate_model(model) - columns = self._parse_columns(instruction) - for column in columns: - if column not in self._df.columns: - raise ValueError(f"Column {column} not found.") - if len(columns) > 1: - raise NotImplementedError("AI top K are limited to a single column.") - - if ground_with_google_search: - msg = exceptions.format_message( - "Enables Grounding with Google Search may impact billing cost. See pricing " - "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" - ) - warnings.warn(msg, category=UserWarning) - - work_estimate = int(len(self._df) * (len(self._df) - 1) / 2) - self._confirm_operation(work_estimate) - - df: bigframes.dataframe.DataFrame = self._df[columns].copy() - column = columns[0] - if df[column].dtype != dtypes.STRING_DTYPE: - df[column] = df[column].astype(dtypes.STRING_DTYPE) - - # `index` is reserved for the `reset_index` below. - if column == "index": - raise ValueError( - "Column name 'index' is reserved. Please choose a different name." - ) - - if k < 1: - raise ValueError("k must be an integer greater than or equal to 1.") - - user_instruction = self._format_instruction(instruction, columns) - - n = df.shape[0] - if k >= n: - return df - - # Create a unique index and duplicate it as the "index" column. This workaround - # is needed for the select search algorithm due to unimplemented bigFrame methods. - df = df.reset_index().rename(columns={"index": "old_index"}).reset_index() - - # Initialize a status column to track the selection status of each item. 
- # - None: Unknown/not yet processed - # - 1.0: Selected as part of the top-k items - # - -1.0: Excluded from the top-k items - status_column = guid.generate_guid("status") - df[status_column] = bigframes.series.Series( - None, dtype=dtypes.FLOAT_DTYPE, session=df._session - ) - - num_selected = 0 - while num_selected < k: - df, num_new_selected = self._topk_partition( - df, - column, - status_column, - user_instruction, - model, - k - num_selected, - ground_with_google_search, - ) - num_selected += num_new_selected - - result_df: bigframes.dataframe.DataFrame = self._df.copy() - return result_df[df.set_index("old_index")[status_column] > 0.0] - - @staticmethod - def _topk_partition( - df, - column: str, - status_column: str, - user_instruction: str, - model, - k: int, - ground_with_google_search: bool, - ): - output_instruction = ( - "Given a question and two documents, choose the document that best answers " - "the question. Respond with 'Document 1' or 'Document 2'. You must choose " - "one, even if neither is ideal. " - ) - - # Random pivot selection for improved average quickselect performance. - pending_df = df[df[status_column].isna()] - pivot_iloc = np.random.randint(0, pending_df.shape[0]) - pivot_index = pending_df.iloc[pivot_iloc]["index"] - pivot_df = pending_df[pending_df["index"] == pivot_index] - - # Build a prompt to compare the pivot item's relevance to other pending items. 
- prompt_s = pending_df[pending_df["index"] != pivot_index][column] - prompt_s = ( - f"{output_instruction}\n\nQuestion: {user_instruction}\n" - + f"\nDocument 1: {column} " - + pivot_df.iloc[0][column] - + f"\nDocument 2: {column} " - + prompt_s # type:ignore - ) - - import bigframes.dataframe - - predict_df = typing.cast( - bigframes.dataframe.DataFrame, - model.predict( - prompt_s, - temperature=0.0, - ground_with_google_search=ground_with_google_search, - ), - ) - - marks = predict_df["ml_generate_text_llm_result"].str.contains("2") - more_relavant: bigframes.dataframe.DataFrame = df[marks] - less_relavent: bigframes.dataframe.DataFrame = df[~marks] - - num_more_relavant = more_relavant.shape[0] - if k < num_more_relavant: - less_relavent[status_column] = -1.0 - pivot_df[status_column] = -1.0 - df = df.combine_first(less_relavent).combine_first(pivot_df) - return df, 0 - else: # k >= num_more_relavant - more_relavant[status_column] = 1.0 - df = df.combine_first(more_relavant) - if k >= num_more_relavant + 1: - pivot_df[status_column] = 1.0 - df = df.combine_first(pivot_df) - return df, num_more_relavant + 1 - else: - return df, num_more_relavant - def sim_join( self, other, diff --git a/notebooks/experimental/ai_operators.ipynb b/notebooks/experimental/ai_operators.ipynb index f830787801..07e20f6bbd 100644 --- a/notebooks/experimental/ai_operators.ipynb +++ b/notebooks/experimental/ai_operators.ipynb @@ -1064,129 +1064,6 @@ "animals.ai.join(animals, \"{left.animal} generally weighs heavier than {right.animal}\", model=gemini_model)" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "kU7BsyTyiouX" - }, - "source": [ - "## AI Top K" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "s9QePXEoiouX" - }, - "source": [ - "AI Top K selects the top K values based on your instruction. 
Here is an example:" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "id": "bMQqtyZ2iouX" - }, - "outputs": [], - "source": [ - "df = bpd.DataFrame({\"Animals\": [\"Corgi\", \"Orange Cat\", \"Parrot\", \"Tarantula\"]})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KiljGBSCiouX" - }, - "source": [ - "You want to find the top two most popular pets:" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 159 - }, - "id": "OZv5WUGIiouX", - "outputId": "ae1cee27-cc31-455e-c4ac-c0a9a5cf4ca5" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Animals
0Corgi
1Orange Cat
\n", - "

2 rows × 1 columns

\n", - "
[2 rows x 1 columns in total]" - ], - "text/plain": [ - " Animals\n", - "0 Corgi\n", - "1 Orange Cat\n", - "\n", - "[2 rows x 1 columns]" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.ai.top_k(\"{Animals} are more popular as pets\", model=gemini_model, k=2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dC8fyu3aiouX" - }, - "source": [ - "Under the hood, the AI top K operator performs pair-wise comparisons with LLM. The top K results are returned in the order of their indices instead of their ranks." - ] - }, { "cell_type": "markdown", "metadata": { diff --git a/tests/system/large/operations/test_ai.py b/tests/system/large/operations/test_ai.py index ded5e0b588..86b30d9c65 100644 --- a/tests/system/large/operations/test_ai.py +++ b/tests/system/large/operations/test_ai.py @@ -848,65 +848,6 @@ def test_sim_join_data_too_large_raises_error(session, text_embedding_generator) ) -@pytest.mark.parametrize( - "instruction", - [ - pytest.param( - "No column reference", - id="zero_column", - marks=pytest.mark.xfail(raises=ValueError), - ), - pytest.param( - "{Animals}", - id="non_existing_column", - marks=pytest.mark.xfail(raises=ValueError), - ), - pytest.param( - "{Animals} and {Animals}", - id="two_columns", - marks=pytest.mark.xfail(raises=NotImplementedError), - ), - pytest.param( - "{index}", - id="preserved", - marks=pytest.mark.xfail(raises=ValueError), - ), - ], -) -def test_top_k_invalid_instruction_raise_error(instruction, gemini_flash_model): - df = dataframe.DataFrame( - { - "Animals": ["Dog", "Cat", "Bird", "Horse"], - "ID": [1, 2, 3, 4], - "index": ["a", "b", "c", "d"], - } - ) - - with bigframes.option_context( - AI_OP_EXP_OPTION, - True, - THRESHOLD_OPTION, - 10, - ): - df.ai.top_k(instruction, model=gemini_flash_model, k=2) - - -def test_top_k_invalid_k_raise_error(gemini_flash_model): - df = dataframe.DataFrame({"Animals": ["Dog", "Cat", "Bird", "Horse"]}) - - with 
bigframes.option_context( - AI_OP_EXP_OPTION, - True, - THRESHOLD_OPTION, - 10, - ), pytest.raises(ValueError): - df.ai.top_k( - "{Animals} are more popular as pets", - gemini_flash_model, - k=0, - ) - - @patch("builtins.input", return_value="") def test_confirm_operation__below_threshold_do_not_confirm(mock_input): df = dataframe.DataFrame({}) diff --git a/tests/system/small/operations/test_ai.py b/tests/system/small/operations/test_ai.py index 771b7b47d3..d6ec3cacad 100644 --- a/tests/system/small/operations/test_ai.py +++ b/tests/system/small/operations/test_ai.py @@ -74,11 +74,6 @@ def predict(self, *args, **kwargs): {"search_column": None, "query": None, "top_k": None, "model": None}, id="search", ), - pytest.param( - bigframes.operations.ai.AIAccessor.top_k, - {"instruction": None, "model": None}, - id="top_k", - ), pytest.param( bigframes.operations.ai.AIAccessor.sim_join, {"other": None, "left_on": None, "right_on": None, "model": None}, @@ -247,25 +242,6 @@ def test_join(session): ) -def test_top_k(session): - df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) - model = FakeGeminiTextGenerator( - dataframe.DataFrame( - {"ml_generate_text_llm_result": ["Document 1"]}, session=session - ), - ) - - with bigframes.option_context( - AI_OP_EXP_OPTION, - True, - THRESHOLD_OPTION, - 50, - ): - result = df.ai.top_k("top k of {col}", model, k=1).to_pandas() - - assert len(result) == 1 - - def test_forecast_default(time_series_df_default_index: dataframe.DataFrame): df = time_series_df_default_index[time_series_df_default_index["id"] == "1"] From c706759b85359b6d23ce3449f6ab138ad2d22f9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 24 Jun 2025 17:01:01 -0500 Subject: [PATCH 07/28] feat: create `deploy_remote_function` and `deploy_udf` functions to immediately deploy functions to BigQuery (#1832) * Refactor function deployment to avoid code duplication This commit refactors the implementation of immediate deployment 
for remote functions and UDFs to eliminate code duplication introduced in a previous commit. Changes: - The `remote_function` and `udf` methods in `bigframes.functions._function_session.FunctionSession` now accept an optional `deploy_immediately: bool` parameter (defaulting to `False`). The previous `deploy_remote_function` and `deploy_udf` methods in `FunctionSession` have been removed, and their logic is now incorporated into the unified methods. - The public API functions `bigframes.pandas.deploy_remote_function` and `bigframes.pandas.deploy_udf` now call the corresponding `FunctionSession` methods with `deploy_immediately=True`. - The public API functions `bigframes.pandas.remote_function` and `bigframes.pandas.udf` call the `FunctionSession` methods with `deploy_immediately=False` (relying on the default). - Unit tests in `tests/unit/functions/test_remote_function.py` have been updated to patch the unified `FunctionSession` methods and verify the correct `deploy_immediately` boolean is passed based on which public API function is called. Note: The underlying provisioning logic in `FunctionSession` currently deploys functions immediately regardless of the `deploy_immediately` flag. This flag serves as an indicator of intent and allows for future enhancements to support true lazy deployment if desired, without further API changes. * Refactor function deployment to use distinct methods This commit corrects a previous refactoring attempt to eliminate code duplication and properly separates immediate-deployment functions from standard (potentially lazy) functions. Changes: - `bigframes.functions._function_session.FunctionSession` now has distinct methods: `remote_function`, `udf`, `deploy_remote_function`, and `deploy_udf`. The `deploy_immediately` flag has been removed from this class. - `deploy_remote_function` and `deploy_udf` methods in `FunctionSession` are responsible for ensuring immediate deployment by calling the underlying provisioning logic directly. 
The standard `remote_function` and `udf` methods in `FunctionSession` also currently call this provisioning logic, meaning all functions are deployed immediately as of now, but the structure allows for future lazy evaluation for standard functions without changing the deploy variants' contract. - Public API functions in `bigframes.pandas` (`remote_function`, `udf`, `deploy_remote_function`, `deploy_udf`) now correctly delegate to their corresponding distinct methods in `FunctionSession` (via the `Session` object). - Unit tests in `tests/unit/functions/test_remote_function.py` have been updated to mock and verify calls to the correct distinct methods on `bigframes.session.Session`. This resolves the issue of using a boolean flag to control deployment type and instead relies on calling specific, dedicated methods for immediate deployment, aligning with your request. * Simplify internal deploy_remote_function and deploy_udf calls This commit simplifies the implementation of `deploy_remote_function` and `deploy_udf` within `bigframes.functions._function_session.FunctionSession`. Given that the standard `remote_function` and `udf` methods in `FunctionSession` already perform immediate deployment of resources (as the underlying provisioning logic they call is immediate), the `deploy_remote_function` and `deploy_udf` methods in the same class are simplified to directly call `self.remote_function(...)` and `self.udf(...)` respectively. This change makes the distinction between the `deploy_` variants and the standard variants in `FunctionSession` primarily a matter of semantic clarity and intent at that level; both paths currently result in immediate deployment. The public API in `bigframes.pandas` continues to offer distinct `deploy_` functions that call these `FunctionSession.deploy_` methods, preserving your user-facing API and its documented behavior of immediate deployment. 
No changes were needed for the public API in `bigframes.pandas` or the unit tests, as they were already aligned with calling distinct methods on the `Session` object, which in turn calls the now-simplified `FunctionSession` methods. * add tests and use kwargs * add missing func argument to bpd --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- bigframes/core/global_session.py | 4 +- bigframes/functions/_function_session.py | 50 +++++++++++++ bigframes/pandas/__init__.py | 32 ++++++++ bigframes/session/__init__.py | 78 +++++++++++++++++++- tests/unit/functions/test_remote_function.py | 54 ++++++++++++++ 5 files changed, 214 insertions(+), 4 deletions(-) diff --git a/bigframes/core/global_session.py b/bigframes/core/global_session.py index 8732b55990..4698e4c4c5 100644 --- a/bigframes/core/global_session.py +++ b/bigframes/core/global_session.py @@ -110,8 +110,8 @@ def get_global_session(): _T = TypeVar("_T") -def with_default_session(func: Callable[..., _T], *args, **kwargs) -> _T: - return func(get_global_session(), *args, **kwargs) +def with_default_session(func_: Callable[..., _T], *args, **kwargs) -> _T: + return func_(get_global_session(), *args, **kwargs) class _GlobalSessionContext: diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index 9e7555431a..a7910127e4 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -668,6 +668,30 @@ def wrapper(func): return wrapper + def deploy_remote_function( + self, + func, + **kwargs, + ): + """Orchestrates the creation of a BigQuery remote function that deploys immediately. + + This method ensures that the remote function is created and available for + use in BigQuery as soon as this call is made. + + Args: + kwargs: + All arguments are passed directly to + :meth:`~bigframes.session.Session.remote_function`. Please see + its docstring for parameter details. 
+ + Returns: + A wrapped remote function, usable in + :meth:`~bigframes.series.Series.apply`. + """ + # TODO(tswast): If we update remote_function to defer deployment, update + # this method to deploy immediately. + return self.remote_function(**kwargs)(func) + def udf( self, input_types: Union[None, type, Sequence[type]] = None, @@ -866,6 +890,32 @@ def wrapper(func): return wrapper + def deploy_udf( + self, + func, + **kwargs, + ): + """Orchestrates the creation of a BigQuery UDF that deploys immediately. + + This method ensures that the UDF is created and available for + use in BigQuery as soon as this call is made. + + Args: + func: + Function to deploy. + kwargs: + All arguments are passed directly to + :meth:`~bigframes.session.Session.udf`. Please see + its docstring for parameter details. + + Returns: + A wrapped Python user defined function, usable in + :meth:`~bigframes.series.Series.apply`. + """ + # TODO(tswast): If we update udf to defer deployment, update this method + # to deploy immediately. 
+ return self.udf(**kwargs)(func) + def _convert_row_processor_sig( signature: inspect.Signature, diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index a9d1c31865..ed999e62c1 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -117,6 +117,22 @@ def remote_function( remote_function.__doc__ = inspect.getdoc(bigframes.session.Session.remote_function) +def deploy_remote_function( + func, + **kwargs, +): + return global_session.with_default_session( + bigframes.session.Session.deploy_remote_function, + func=func, + **kwargs, + ) + + +deploy_remote_function.__doc__ = inspect.getdoc( + bigframes.session.Session.deploy_remote_function +) + + def udf( *, input_types: Union[None, type, Sequence[type]] = None, @@ -140,6 +156,20 @@ def udf( udf.__doc__ = inspect.getdoc(bigframes.session.Session.udf) +def deploy_udf( + func, + **kwargs, +): + return global_session.with_default_session( + bigframes.session.Session.deploy_udf, + func=func, + **kwargs, + ) + + +deploy_udf.__doc__ = inspect.getdoc(bigframes.session.Session.deploy_udf) + + @typing.overload def to_datetime( arg: Union[ @@ -330,6 +360,8 @@ def reset_session(): clean_up_by_session_id, concat, cut, + deploy_remote_function, + deploy_udf, get_default_session_id, get_dummies, merge, diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index c06233bad3..13db6823c1 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1343,6 +1343,40 @@ def _check_file_size(self, filepath: str): "for large files to avoid loading the file into local memory." ) + def deploy_remote_function( + self, + func, + **kwargs, + ): + """Orchestrates the creation of a BigQuery remote function that deploys immediately. + + This method ensures that the remote function is created and available for + use in BigQuery as soon as this call is made. + + Args: + func: + Function to deploy. 
+ kwargs: + All arguments are passed directly to + :meth:`~bigframes.session.Session.remote_function`. Please see + its docstring for parameter details. + + Returns: + A wrapped remote function, usable in + :meth:`~bigframes.series.Series.apply`. + """ + return self._function_session.deploy_remote_function( + func, + # Session-provided arguments. + session=self, + bigquery_client=self._clients_provider.bqclient, + bigquery_connection_client=self._clients_provider.bqconnectionclient, + cloud_functions_client=self._clients_provider.cloudfunctionsclient, + resource_manager_client=self._clients_provider.resourcemanagerclient, + # User-provided arguments. + **kwargs, + ) + def remote_function( self, # Make sure that the input/output types, and dataset can be used @@ -1565,9 +1599,15 @@ def remote_function( `bigframes_remote_function` - The bigquery remote function capable of calling into `bigframes_cloud_function`. """ return self._function_session.remote_function( + # Session-provided arguments. + session=self, + bigquery_client=self._clients_provider.bqclient, + bigquery_connection_client=self._clients_provider.bqconnectionclient, + cloud_functions_client=self._clients_provider.cloudfunctionsclient, + resource_manager_client=self._clients_provider.resourcemanagerclient, + # User-provided arguments. input_types=input_types, output_type=output_type, - session=self, dataset=dataset, bigquery_connection=bigquery_connection, reuse=reuse, @@ -1585,6 +1625,37 @@ def remote_function( cloud_build_service_account=cloud_build_service_account, ) + def deploy_udf( + self, + func, + **kwargs, + ): + """Orchestrates the creation of a BigQuery UDF that deploys immediately. + + This method ensures that the UDF is created and available for + use in BigQuery as soon as this call is made. + + Args: + func: + Function to deploy. + kwargs: + All arguments are passed directly to + :meth:`~bigframes.session.Session.udf`. Please see + its docstring for parameter details. 
+ + Returns: + A wrapped Python user defined function, usable in + :meth:`~bigframes.series.Series.apply`. + """ + return self._function_session.deploy_udf( + func, + # Session-provided arguments. + session=self, + bigquery_client=self._clients_provider.bqclient, + # User-provided arguments. + **kwargs, + ) + def udf( self, *, @@ -1726,9 +1797,12 @@ def udf( deployed for the user defined code. """ return self._function_session.udf( + # Session-provided arguments. + session=self, + bigquery_client=self._clients_provider.bqclient, + # User-provided arguments. input_types=input_types, output_type=output_type, - session=self, dataset=dataset, bigquery_connection=bigquery_connection, name=name, diff --git a/tests/unit/functions/test_remote_function.py b/tests/unit/functions/test_remote_function.py index 978281e5c9..ea09ac59d3 100644 --- a/tests/unit/functions/test_remote_function.py +++ b/tests/unit/functions/test_remote_function.py @@ -89,3 +89,57 @@ def function_without_return_annotation(myparam: int): match="'output_type' was not set .* missing a return type annotation", ): remote_function_decorator(function_without_return_annotation) + + +def test_deploy_remote_function(): + session = mocks.create_bigquery_session() + + def my_remote_func(x: int) -> int: + return x * 2 + + deployed = session.deploy_remote_function( + my_remote_func, cloud_function_service_account="test_sa@example.com" + ) + + # Test that the function would have been deployed somewhere. + assert deployed.bigframes_bigquery_function + + +def test_deploy_remote_function_with_name(): + session = mocks.create_bigquery_session() + + def my_remote_func(x: int) -> int: + return x * 2 + + deployed = session.deploy_remote_function( + my_remote_func, + name="my_custom_name", + cloud_function_service_account="test_sa@example.com", + ) + + # Test that the function would have been deployed somewhere. 
+ assert "my_custom_name" in deployed.bigframes_bigquery_function + + +def test_deploy_udf(): + session = mocks.create_bigquery_session() + + def my_remote_func(x: int) -> int: + return x * 2 + + deployed = session.deploy_udf(my_remote_func) + + # Test that the function would have been deployed somewhere. + assert deployed.bigframes_bigquery_function + + +def test_deploy_udf_with_name(): + session = mocks.create_bigquery_session() + + def my_remote_func(x: int) -> int: + return x * 2 + + deployed = session.deploy_udf(my_remote_func, name="my_custom_name") + + # Test that the function would have been deployed somewhere. + assert "my_custom_name" in deployed.bigframes_bigquery_function From 0709f17ed01839f34cce272eeed5d745daa9e30f Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 24 Jun 2025 15:34:13 -0700 Subject: [PATCH 08/28] refactor: provide infrastructure for SQLGlot scalar compiler (#1850) * refactor: provide infrastructure for SQLGlot scalar compiler * remove redundant code * remove redundant code * add TODO back --- .../compile/sqlglot/expressions/__init__.py | 13 +++++ .../sqlglot/expressions/binary_compiler.py | 49 +++++++++++++++++ .../sqlglot/expressions/nary_compiler.py | 32 +++++++++++ .../sqlglot/expressions/op_registration.py | 44 +++++++++++++++ .../sqlglot/expressions/ternary_compiler.py | 35 ++++++++++++ .../compile/sqlglot/expressions/typed_expr.py | 27 ++++++++++ .../sqlglot/expressions/unary_compiler.py | 31 +++++++++++ .../core/compile/sqlglot/scalar_compiler.py | 54 +++++-------------- 8 files changed, 244 insertions(+), 41 deletions(-) create mode 100644 bigframes/core/compile/sqlglot/expressions/__init__.py create mode 100644 bigframes/core/compile/sqlglot/expressions/binary_compiler.py create mode 100644 bigframes/core/compile/sqlglot/expressions/nary_compiler.py create mode 100644 bigframes/core/compile/sqlglot/expressions/op_registration.py create mode 100644 bigframes/core/compile/sqlglot/expressions/ternary_compiler.py create mode 
100644 bigframes/core/compile/sqlglot/expressions/typed_expr.py create mode 100644 bigframes/core/compile/sqlglot/expressions/unary_compiler.py diff --git a/bigframes/core/compile/sqlglot/expressions/__init__.py b/bigframes/core/compile/sqlglot/expressions/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/bigframes/core/compile/sqlglot/expressions/binary_compiler.py b/bigframes/core/compile/sqlglot/expressions/binary_compiler.py new file mode 100644 index 0000000000..447b5af860 --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/binary_compiler.py @@ -0,0 +1,49 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import typing + +import sqlglot.expressions as sge + +from bigframes import dtypes +from bigframes import operations as ops +from bigframes.core.compile.sqlglot.expressions.op_registration import OpRegistration +from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr + +BinaryOpCompiler = typing.Callable[[ops.BinaryOp, TypedExpr, TypedExpr], sge.Expression] + +BINARY_OP_REIGSTRATION = OpRegistration[BinaryOpCompiler]() + + +def compile(op: ops.BinaryOp, left: TypedExpr, right: TypedExpr) -> sge.Expression: + return BINARY_OP_REIGSTRATION[op](op, left, right) + + +# TODO: add parenthesize for operators +@BINARY_OP_REIGSTRATION.register(ops.add_op) +def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: + if left.dtype == dtypes.STRING_DTYPE and right.dtype == dtypes.STRING_DTYPE: + # String addition + return sge.Concat(expressions=[left.expr, right.expr]) + + # Numerical addition + return sge.Add(this=left.expr, expression=right.expr) + + +@BINARY_OP_REIGSTRATION.register(ops.ge_op) +def compile_ge(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: + + return sge.GTE(this=left.expr, expression=right.expr) diff --git a/bigframes/core/compile/sqlglot/expressions/nary_compiler.py b/bigframes/core/compile/sqlglot/expressions/nary_compiler.py new file mode 100644 index 0000000000..d470009c2c --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/nary_compiler.py @@ -0,0 +1,32 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import typing + +import sqlglot.expressions as sge + +from bigframes import operations as ops +from bigframes.core.compile.sqlglot.expressions.op_registration import OpRegistration +from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr + +# No simpler way to specify that the compilation function expects varargs. +NaryOpCompiler = typing.Callable[..., sge.Expression] + +NARY_OP_REIGSTRATION = OpRegistration[NaryOpCompiler]() + + +def compile(op: ops.NaryOp, *args: TypedExpr) -> sge.Expression: + return NARY_OP_REIGSTRATION[op](op, *args) diff --git a/bigframes/core/compile/sqlglot/expressions/op_registration.py b/bigframes/core/compile/sqlglot/expressions/op_registration.py new file mode 100644 index 0000000000..c2290e7c93 --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/op_registration.py @@ -0,0 +1,44 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import typing +from typing import Generic, TypeVar + +from bigframes import operations as ops + +T = TypeVar("T") + + +class OpRegistration(Generic[T]): + _registered_ops: dict[str, T] = {} + + def register( + self, op: ops.ScalarOp | type[ops.ScalarOp] + ) -> typing.Callable[[T], T]: + key = typing.cast(str, op.name) + + def decorator(item: T): + if key in self._registered_ops: + raise ValueError(f"{key} is already registered") + self._registered_ops[key] = item + return item + + return decorator + + def __getitem__(self, key: str | ops.ScalarOp) -> T: + if isinstance(key, ops.ScalarOp): + return self._registered_ops[key.name] + return self._registered_ops[key] diff --git a/bigframes/core/compile/sqlglot/expressions/ternary_compiler.py b/bigframes/core/compile/sqlglot/expressions/ternary_compiler.py new file mode 100644 index 0000000000..ee33b48c17 --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/ternary_compiler.py @@ -0,0 +1,35 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import typing + +import sqlglot.expressions as sge + +from bigframes import operations as ops +from bigframes.core.compile.sqlglot.expressions.op_registration import OpRegistration +from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr + +TernaryOpCompiler = typing.Callable[ + [ops.TernaryOp, TypedExpr, TypedExpr, TypedExpr], sge.Expression +] + +TERNATRY_OP_REIGSTRATION = OpRegistration[TernaryOpCompiler]() + + +def compile( + op: ops.TernaryOp, expr1: TypedExpr, expr2: TypedExpr, expr3: TypedExpr +) -> sge.Expression: + return TERNATRY_OP_REIGSTRATION[op](op, expr1, expr2, expr3) diff --git a/bigframes/core/compile/sqlglot/expressions/typed_expr.py b/bigframes/core/compile/sqlglot/expressions/typed_expr.py new file mode 100644 index 0000000000..e693dd94a2 --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/typed_expr.py @@ -0,0 +1,27 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import dataclasses + +import sqlglot.expressions as sge + +from bigframes import dtypes + + +@dataclasses.dataclass(frozen=True) +class TypedExpr: + """SQLGlot expression with type.""" + + expr: sge.Expression + dtype: dtypes.ExpressionType diff --git a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py new file mode 100644 index 0000000000..56951a58dd --- /dev/null +++ b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py @@ -0,0 +1,31 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import typing + +import sqlglot.expressions as sge + +from bigframes import operations as ops +from bigframes.core.compile.sqlglot.expressions.op_registration import OpRegistration +from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr + +UnaryOpCompiler = typing.Callable[[ops.UnaryOp, TypedExpr], sge.Expression] + +UNARY_OP_REIGSTRATION = OpRegistration[UnaryOpCompiler]() + + +def compile(op: ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return UNARY_OP_REIGSTRATION[op](op, expr) diff --git a/bigframes/core/compile/sqlglot/scalar_compiler.py b/bigframes/core/compile/sqlglot/scalar_compiler.py index 59deb9c8f3..f553518300 100644 --- a/bigframes/core/compile/sqlglot/scalar_compiler.py +++ b/bigframes/core/compile/sqlglot/scalar_compiler.py @@ -13,25 +13,22 @@ # limitations under the License. 
from __future__ import annotations -import dataclasses import functools import sqlglot.expressions as sge -from bigframes import dtypes from bigframes.core import expression +from bigframes.core.compile.sqlglot.expressions import ( + binary_compiler, + nary_compiler, + ternary_compiler, + typed_expr, + unary_compiler, +) import bigframes.core.compile.sqlglot.sqlglot_ir as ir import bigframes.operations as ops -@dataclasses.dataclass(frozen=True) -class TypedExpr: - """SQLGlot expression with type.""" - - expr: sge.Expression - dtype: dtypes.ExpressionType - - @functools.singledispatch def compile_scalar_expression( expression: expression.Expression, @@ -63,46 +60,21 @@ def compile_constant_expression( def compile_op_expression(expr: expression.OpExpression) -> sge.Expression: # Non-recursively compiles the children scalar expressions. args = tuple( - TypedExpr(compile_scalar_expression(input), input.output_type) + typed_expr.TypedExpr(compile_scalar_expression(input), input.output_type) for input in expr.inputs ) op = expr.op - op_name = expr.op.__class__.__name__ - method_name = f"compile_{op_name.lower()}" - method = globals().get(method_name, None) - if method is None: - raise ValueError( - f"Compilation method '{method_name}' not found for operator '{op_name}'." - ) - if isinstance(op, ops.UnaryOp): - return method(op, args[0]) + return unary_compiler.compile(op, args[0]) elif isinstance(op, ops.BinaryOp): - return method(op, args[0], args[1]) + return binary_compiler.compile(op, args[0], args[1]) elif isinstance(op, ops.TernaryOp): - return method(op, args[0], args[1], args[2]) + return ternary_compiler.compile(op, args[0], args[1], args[2]) elif isinstance(op, ops.NaryOp): - return method(op, *args) + return nary_compiler.compile(op, *args) else: raise TypeError( - f"Operator '{op_name}' has an unrecognized arity or type " + f"Operator '{op.name}' has an unrecognized arity or type " "and cannot be compiled." 
) - - -# TODO: add parenthesize for operators -def compile_addop(op: ops.AddOp, left: TypedExpr, right: TypedExpr) -> sge.Expression: - if left.dtype == dtypes.STRING_DTYPE and right.dtype == dtypes.STRING_DTYPE: - # String addition - return sge.Concat(expressions=[left.expr, right.expr]) - - # Numerical addition - return sge.Add(this=left.expr, expression=right.expr) - - -def compile_ge( - op: ops.ge_op, left: TypedExpr, right: TypedExpr # type: ignore[valid-type] -) -> sge.Expression: - - return sge.GTE(this=left.expr, expression=right.expr) From bc885bd1ea7987c60d3a73c18329461a40adb9f0 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 24 Jun 2025 17:10:32 -0700 Subject: [PATCH 09/28] chore: add compile_explode (#1848) Fixes internal issue 427306238 --- bigframes/core/compile/sqlglot/compiler.py | 8 ++ bigframes/core/compile/sqlglot/sqlglot_ir.py | 90 +++++++++++++++++++ .../test_compile_explode_dataframe/out.sql | 21 +++++ .../test_compile_explode_series/out.sql | 18 ++++ .../compile/sqlglot/test_compile_explode.py | 31 +++++++ .../bigframes_vendored/pandas/core/frame.py | 1 + 6 files changed, 169 insertions(+) create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_dataframe/out.sql create mode 100644 tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_series/out.sql create mode 100644 tests/unit/core/compile/sqlglot/test_compile_explode.py diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index a38078b687..606fe41b5e 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -229,6 +229,14 @@ def compile_concat( uid_gen=self.uid_gen, ) + @_compile_node.register + def compile_explode( + self, node: nodes.ExplodeNode, child: ir.SQLGlotIR + ) -> ir.SQLGlotIR: + offsets_col = node.offsets_col.sql if (node.offsets_col is not None) else None + columns = tuple(ref.id.sql for ref in 
node.column_ids) + return child.explode(columns, offsets_col) + def _replace_unsupported_ops(node: nodes.BigFrameNode): node = nodes.bottom_up(node, rewrite.rewrite_slice) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 6b805802b0..6bc2b55162 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -290,6 +290,96 @@ def replace( ).sql(dialect=self.dialect, pretty=self.pretty) return f"{merge_str}\n{whens_str}" + def explode( + self, + column_names: tuple[str, ...], + offsets_col: typing.Optional[str], + ) -> SQLGlotIR: + num_columns = len(list(column_names)) + assert num_columns > 0, "At least one column must be provided for explode." + if num_columns == 1: + return self._explode_single_column(column_names[0], offsets_col) + else: + return self._explode_multiple_columns(column_names, offsets_col) + + def _explode_single_column( + self, column_name: str, offsets_col: typing.Optional[str] + ) -> SQLGlotIR: + """Helper method to handle the case of exploding a single column.""" + + offset = ( + sge.to_identifier(offsets_col, quoted=self.quoted) if offsets_col else None + ) + column = sge.to_identifier(column_name, quoted=self.quoted) + unnested_column_alias = sge.to_identifier( + next(self.uid_gen.get_uid_stream("bfcol_")), quoted=self.quoted + ) + unnest_expr = sge.Unnest( + expressions=[column], + alias=sge.TableAlias(columns=[unnested_column_alias]), + offset=offset, + ) + selection = sge.Star(replace=[unnested_column_alias.as_(column)]) + # TODO: "CROSS" if not keep_empty else "LEFT" + # TODO: overlaps_with_parent to replace existing column. 
+ new_expr = ( + self._encapsulate_as_cte() + .select(selection, append=False) + .join(unnest_expr, join_type="CROSS") + ) + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) + + def _explode_multiple_columns( + self, + column_names: tuple[str, ...], + offsets_col: typing.Optional[str], + ) -> SQLGlotIR: + """Helper method to handle the case of exploding multiple columns.""" + offset = ( + sge.to_identifier(offsets_col, quoted=self.quoted) if offsets_col else None + ) + columns = [ + sge.to_identifier(column_name, quoted=self.quoted) + for column_name in column_names + ] + + # If there are multiple columns, we need to unnest by zipping the arrays: + # https://cloud.google.com/bigquery/docs/arrays#zipping_arrays + column_lengths = [ + sge.func("ARRAY_LENGTH", sge.to_identifier(column, quoted=self.quoted)) - 1 + for column in columns + ] + generate_array = sge.func( + "GENERATE_ARRAY", + sge.convert(0), + sge.func("LEAST", *column_lengths), + ) + unnested_offset_alias = sge.to_identifier( + next(self.uid_gen.get_uid_stream("bfcol_")), quoted=self.quoted + ) + unnest_expr = sge.Unnest( + expressions=[generate_array], + alias=sge.TableAlias(columns=[unnested_offset_alias]), + offset=offset, + ) + selection = sge.Star( + replace=[ + sge.Bracket( + this=column, + expressions=[unnested_offset_alias], + safe=True, + offset=False, + ).as_(column) + for column in columns + ] + ) + new_expr = ( + self._encapsulate_as_cte() + .select(selection, append=False) + .join(unnest_expr, join_type="CROSS") + ) + return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) + def _encapsulate_as_cte( self, ) -> sge.Select: diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_dataframe/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_dataframe/out.sql new file mode 100644 index 0000000000..679da58f44 --- /dev/null +++ 
b/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_dataframe/out.sql @@ -0,0 +1,21 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `int_list_col` AS `bfcol_1`, + `string_list_col` AS `bfcol_2` + FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` +), `bfcte_1` AS ( + SELECT + * + REPLACE (`bfcol_1`[SAFE_OFFSET(`bfcol_13`)] AS `bfcol_1`, `bfcol_2`[SAFE_OFFSET(`bfcol_13`)] AS `bfcol_2`) + FROM `bfcte_0` + CROSS JOIN UNNEST(GENERATE_ARRAY(0, LEAST(ARRAY_LENGTH(`bfcol_1`) - 1, ARRAY_LENGTH(`bfcol_2`) - 1))) AS `bfcol_13` WITH OFFSET AS `bfcol_7` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_0` AS `rowindex_1`, + `bfcol_1` AS `int_list_col`, + `bfcol_2` AS `string_list_col` +FROM `bfcte_1` +ORDER BY + `bfcol_7` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_series/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_series/out.sql new file mode 100644 index 0000000000..8bfd1eb005 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_series/out.sql @@ -0,0 +1,18 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `int_list_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` +), `bfcte_1` AS ( + SELECT + * + REPLACE (`bfcol_8` AS `bfcol_1`) + FROM `bfcte_0` + CROSS JOIN UNNEST(`bfcol_1`) AS `bfcol_8` WITH OFFSET AS `bfcol_4` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_1` AS `int_list_col` +FROM `bfcte_1` +ORDER BY + `bfcol_4` ASC NULLS LAST \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_explode.py b/tests/unit/core/compile/sqlglot/test_compile_explode.py new file mode 100644 index 0000000000..34adbbd23a --- /dev/null +++ b/tests/unit/core/compile/sqlglot/test_compile_explode.py @@ -0,0 +1,31 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, 
Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import bigframes.pandas as bpd + +pytest.importorskip("pytest_snapshot") + + +# TODO: check order by with offset +def test_compile_explode_series(repeated_types_df: bpd.DataFrame, snapshot): + s = repeated_types_df["int_list_col"].explode() + snapshot.assert_match(s.to_frame().sql, "out.sql") + + +def test_compile_explode_dataframe(repeated_types_df: bpd.DataFrame, snapshot): + exploded_columns = ["int_list_col", "string_list_col"] + df = repeated_types_df[["rowindex", *exploded_columns]].explode(exploded_columns) + snapshot.assert_match(df.sql, "out.sql") diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 224fe25f16..0606032d34 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4124,6 +4124,7 @@ def explode( **Examples:** >>> import bigframes.pandas as bpd + >>> import numpy as np >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'A': [[0, 1, 2], [], [], [3, 4]], From c88a825342dfd6ec5a8a141b79c7da3a3b1dea83 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 25 Jun 2025 11:56:38 -0700 Subject: [PATCH 10/28] chore: add array operators to SQLGlot compiler (#1852) * [WIP] Add array operators. 
Need to finish tests * add tests * fix lint * fix typos * Use sge.Bracket() for safe_offset --- .../sqlglot/expressions/binary_compiler.py | 15 ++---- .../sqlglot/expressions/nary_compiler.py | 9 +--- .../sqlglot/expressions/op_registration.py | 30 ++++++++---- .../sqlglot/expressions/ternary_compiler.py | 10 +--- .../sqlglot/expressions/unary_compiler.py | 49 +++++++++++++++++-- .../compile/sqlglot/expressions/__init__.py | 13 +++++ .../test_add_numeric}/out.sql | 0 .../test_add_numeric_w_scalar}/out.sql | 0 .../test_add_string}/out.sql | 0 .../test_array_index/out.sql | 15 ++++++ .../test_array_slice_with_only_start/out.sql | 21 ++++++++ .../out.sql | 21 ++++++++ .../test_array_to_string/out.sql | 15 ++++++ .../out.sql | 16 ++++++ .../test_compile_string_add/out.sql | 16 ++++++ .../test_binary_compiler.py} | 12 +++-- .../expressions/test_op_registration.py | 43 ++++++++++++++++ .../expressions/test_unary_compiler.py | 44 +++++++++++++++++ 18 files changed, 287 insertions(+), 42 deletions(-) create mode 100644 tests/unit/core/compile/sqlglot/expressions/__init__.py rename tests/unit/core/compile/sqlglot/{snapshots/test_compile_scalar_expr/test_compile_numerical_add => expressions/snapshots/test_binary_compiler/test_add_numeric}/out.sql (100%) rename tests/unit/core/compile/sqlglot/{snapshots/test_compile_scalar_expr/test_compile_numerical_add_w_scalar => expressions/snapshots/test_binary_compiler/test_add_numeric_w_scalar}/out.sql (100%) rename tests/unit/core/compile/sqlglot/{snapshots/test_compile_scalar_expr/test_compile_string_add => expressions/snapshots/test_binary_compiler/test_add_string}/out.sql (100%) create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_index/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_only_start/out.sql create mode 100644 
tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_start_and_stop/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_to_string/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_compile_numerical_add_w_scalar/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_compile_string_add/out.sql rename tests/unit/core/compile/sqlglot/{test_compile_scalar_expr.py => expressions/test_binary_compiler.py} (82%) create mode 100644 tests/unit/core/compile/sqlglot/expressions/test_op_registration.py create mode 100644 tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py diff --git a/bigframes/core/compile/sqlglot/expressions/binary_compiler.py b/bigframes/core/compile/sqlglot/expressions/binary_compiler.py index 447b5af860..ec75d3a3a4 100644 --- a/bigframes/core/compile/sqlglot/expressions/binary_compiler.py +++ b/bigframes/core/compile/sqlglot/expressions/binary_compiler.py @@ -14,8 +14,6 @@ from __future__ import annotations -import typing - import sqlglot.expressions as sge from bigframes import dtypes @@ -23,17 +21,15 @@ from bigframes.core.compile.sqlglot.expressions.op_registration import OpRegistration from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr -BinaryOpCompiler = typing.Callable[[ops.BinaryOp, TypedExpr, TypedExpr], sge.Expression] - -BINARY_OP_REIGSTRATION = OpRegistration[BinaryOpCompiler]() +BINARY_OP_REGISTRATION = OpRegistration() def compile(op: ops.BinaryOp, left: TypedExpr, right: TypedExpr) -> sge.Expression: - return BINARY_OP_REIGSTRATION[op](op, left, right) + return BINARY_OP_REGISTRATION[op](op, left, right) # TODO: add parenthesize for operators -@BINARY_OP_REIGSTRATION.register(ops.add_op) +@BINARY_OP_REGISTRATION.register(ops.add_op) def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: if 
left.dtype == dtypes.STRING_DTYPE and right.dtype == dtypes.STRING_DTYPE: # String addition @@ -43,7 +39,6 @@ def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: return sge.Add(this=left.expr, expression=right.expr) -@BINARY_OP_REIGSTRATION.register(ops.ge_op) -def compile_ge(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: - +@BINARY_OP_REGISTRATION.register(ops.ge_op) +def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: return sge.GTE(this=left.expr, expression=right.expr) diff --git a/bigframes/core/compile/sqlglot/expressions/nary_compiler.py b/bigframes/core/compile/sqlglot/expressions/nary_compiler.py index d470009c2c..12f68613d7 100644 --- a/bigframes/core/compile/sqlglot/expressions/nary_compiler.py +++ b/bigframes/core/compile/sqlglot/expressions/nary_compiler.py @@ -14,19 +14,14 @@ from __future__ import annotations -import typing - import sqlglot.expressions as sge from bigframes import operations as ops from bigframes.core.compile.sqlglot.expressions.op_registration import OpRegistration from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr -# No simpler way to specify that the compilation function expects varargs. 
-NaryOpCompiler = typing.Callable[..., sge.Expression] - -NARY_OP_REIGSTRATION = OpRegistration[NaryOpCompiler]() +NARY_OP_REGISTRATION = OpRegistration() def compile(op: ops.NaryOp, *args: TypedExpr) -> sge.Expression: - return NARY_OP_REIGSTRATION[op](op, *args) + return NARY_OP_REGISTRATION[op](op, *args) diff --git a/bigframes/core/compile/sqlglot/expressions/op_registration.py b/bigframes/core/compile/sqlglot/expressions/op_registration.py index c2290e7c93..e30b58a6d2 100644 --- a/bigframes/core/compile/sqlglot/expressions/op_registration.py +++ b/bigframes/core/compile/sqlglot/expressions/op_registration.py @@ -15,30 +15,40 @@ from __future__ import annotations import typing -from typing import Generic, TypeVar + +from sqlglot import expressions as sge from bigframes import operations as ops -T = TypeVar("T") +# We should've been more specific about input types. Unfortunately, +# MyPy doesn't support more rigorous checks. +CompilationFunc = typing.Callable[..., sge.Expression] -class OpRegistration(Generic[T]): - _registered_ops: dict[str, T] = {} +class OpRegistration: + def __init__(self) -> None: + self._registered_ops: dict[str, CompilationFunc] = {} def register( self, op: ops.ScalarOp | type[ops.ScalarOp] - ) -> typing.Callable[[T], T]: - key = typing.cast(str, op.name) - - def decorator(item: T): + ) -> typing.Callable[[CompilationFunc], CompilationFunc]: + def decorator(item: CompilationFunc): + def arg_checker(*args, **kwargs): + if not isinstance(args[0], ops.ScalarOp): + raise ValueError( + f"The first parameter must be an operator. 
Got {type(args[0])}" + ) + return item(*args, **kwargs) + + key = typing.cast(str, op.name) if key in self._registered_ops: raise ValueError(f"{key} is already registered") self._registered_ops[key] = item - return item + return arg_checker return decorator - def __getitem__(self, key: str | ops.ScalarOp) -> T: + def __getitem__(self, key: str | ops.ScalarOp) -> CompilationFunc: if isinstance(key, ops.ScalarOp): return self._registered_ops[key.name] return self._registered_ops[key] diff --git a/bigframes/core/compile/sqlglot/expressions/ternary_compiler.py b/bigframes/core/compile/sqlglot/expressions/ternary_compiler.py index ee33b48c17..9b00771f7d 100644 --- a/bigframes/core/compile/sqlglot/expressions/ternary_compiler.py +++ b/bigframes/core/compile/sqlglot/expressions/ternary_compiler.py @@ -14,22 +14,16 @@ from __future__ import annotations -import typing - import sqlglot.expressions as sge from bigframes import operations as ops from bigframes.core.compile.sqlglot.expressions.op_registration import OpRegistration from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr -TernaryOpCompiler = typing.Callable[ - [ops.TernaryOp, TypedExpr, TypedExpr, TypedExpr], sge.Expression -] - -TERNATRY_OP_REIGSTRATION = OpRegistration[TernaryOpCompiler]() +TERNATRY_OP_REGISTRATION = OpRegistration() def compile( op: ops.TernaryOp, expr1: TypedExpr, expr2: TypedExpr, expr3: TypedExpr ) -> sge.Expression: - return TERNATRY_OP_REIGSTRATION[op](op, expr1, expr2, expr3) + return TERNATRY_OP_REGISTRATION[op](op, expr1, expr2, expr3) diff --git a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py index 56951a58dd..716917b455 100644 --- a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py @@ -16,16 +16,57 @@ import typing +import sqlglot import sqlglot.expressions as sge from bigframes import operations as ops from 
bigframes.core.compile.sqlglot.expressions.op_registration import OpRegistration from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr -UnaryOpCompiler = typing.Callable[[ops.UnaryOp, TypedExpr], sge.Expression] - -UNARY_OP_REIGSTRATION = OpRegistration[UnaryOpCompiler]() +UNARY_OP_REGISTRATION = OpRegistration() def compile(op: ops.UnaryOp, expr: TypedExpr) -> sge.Expression: - return UNARY_OP_REIGSTRATION[op](op, expr) + return UNARY_OP_REGISTRATION[op](op, expr) + + +@UNARY_OP_REGISTRATION.register(ops.ArrayToStringOp) +def _(op: ops.ArrayToStringOp, expr: TypedExpr) -> sge.Expression: + return sge.ArrayToString(this=expr.expr, expression=f"'{op.delimiter}'") + + +@UNARY_OP_REGISTRATION.register(ops.ArrayIndexOp) +def _(op: ops.ArrayIndexOp, expr: TypedExpr) -> sge.Expression: + return sge.Bracket( + this=expr.expr, + expressions=[sge.Literal.number(op.index)], + safe=True, + offset=False, + ) + + +@UNARY_OP_REGISTRATION.register(ops.ArraySliceOp) +def _(op: ops.ArraySliceOp, expr: TypedExpr) -> sge.Expression: + slice_idx = sqlglot.to_identifier("slice_idx") + + conditions: typing.List[sge.Predicate] = [slice_idx >= op.start] + + if op.stop is not None: + conditions.append(slice_idx < op.stop) + + # local name for each element in the array + el = sqlglot.to_identifier("el") + + selected_elements = ( + sge.select(el) + .from_( + sge.Unnest( + expressions=[expr.expr], + alias=sge.TableAlias(columns=[el]), + offset=slice_idx, + ) + ) + .where(*conditions) + ) + + return sge.array(selected_elements) diff --git a/tests/unit/core/compile/sqlglot/expressions/__init__.py b/tests/unit/core/compile/sqlglot/expressions/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric/out.sql diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add_w_scalar/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric_w_scalar/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_numerical_add_w_scalar/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric_w_scalar/out.sql diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_string_add/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_string/out.sql similarity index 100% rename from tests/unit/core/compile/sqlglot/snapshots/test_compile_scalar_expr/test_compile_string_add/out.sql rename to tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_string/out.sql diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_index/out.sql 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_index/out.sql new file mode 100644 index 0000000000..33a8bded13 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_index/out.sql @@ -0,0 +1,15 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `string_list_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_1`[SAFE_OFFSET(1)] AS `bfcol_4` + FROM `bfcte_0` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_4` AS `string_list_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_only_start/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_only_start/out.sql new file mode 100644 index 0000000000..34d2225931 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_only_start/out.sql @@ -0,0 +1,21 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `string_list_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` +), `bfcte_1` AS ( + SELECT + *, + ARRAY( + SELECT + el + FROM UNNEST(`bfcol_1`) AS el WITH OFFSET AS slice_idx + WHERE + slice_idx >= 1 + ) AS `bfcol_4` + FROM `bfcte_0` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_4` AS `string_list_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_start_and_stop/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_start_and_stop/out.sql new file mode 100644 index 0000000000..d46803ce7c --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_start_and_stop/out.sql @@ -0,0 +1,21 @@ +WITH `bfcte_0` AS ( + 
SELECT + `rowindex` AS `bfcol_0`, + `string_list_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` +), `bfcte_1` AS ( + SELECT + *, + ARRAY( + SELECT + el + FROM UNNEST(`bfcol_1`) AS el WITH OFFSET AS slice_idx + WHERE + slice_idx >= 1 AND slice_idx < 5 + ) AS `bfcol_4` + FROM `bfcte_0` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_4` AS `string_list_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_to_string/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_to_string/out.sql new file mode 100644 index 0000000000..e0db21f972 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_to_string/out.sql @@ -0,0 +1,15 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `string_list_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` +), `bfcte_1` AS ( + SELECT + *, + ARRAY_TO_STRING(`bfcol_1`, '.') AS `bfcol_4` + FROM `bfcte_0` +) +SELECT + `bfcol_0` AS `rowindex`, + `bfcol_4` AS `string_list_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_compile_numerical_add_w_scalar/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_compile_numerical_add_w_scalar/out.sql new file mode 100644 index 0000000000..9c4b01a6df --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_compile_numerical_add_w_scalar/out.sql @@ -0,0 +1,16 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0`, + `rowindex` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_1` AS `bfcol_4`, + `bfcol_0` + 1 AS `bfcol_5` + FROM `bfcte_0` +) +SELECT + `bfcol_4` AS `rowindex`, + `bfcol_5` AS `int64_col` +FROM `bfcte_1` \ No newline at end of 
file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_compile_string_add/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_compile_string_add/out.sql new file mode 100644 index 0000000000..7a8ab83df1 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_compile_string_add/out.sql @@ -0,0 +1,16 @@ +WITH `bfcte_0` AS ( + SELECT + `rowindex` AS `bfcol_0`, + `string_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_0` AS `bfcol_4`, + CONCAT(`bfcol_1`, 'a') AS `bfcol_5` + FROM `bfcte_0` +) +SELECT + `bfcol_4` AS `rowindex`, + `bfcol_5` AS `string_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py b/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py similarity index 82% rename from tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py rename to tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py index 862ee2467c..180d43d771 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_scalar_expr.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py @@ -19,19 +19,25 @@ pytest.importorskip("pytest_snapshot") -def test_compile_numerical_add(scalars_types_df: bpd.DataFrame, snapshot): +def test_add_numeric(scalars_types_df: bpd.DataFrame, snapshot): bf_df = scalars_types_df[["int64_col"]] + bf_df["int64_col"] = bf_df["int64_col"] + bf_df["int64_col"] + snapshot.assert_match(bf_df.sql, "out.sql") -def test_compile_numerical_add_w_scalar(scalars_types_df: bpd.DataFrame, snapshot): +def test_add_numeric_w_scalar(scalars_types_df: bpd.DataFrame, snapshot): bf_df = scalars_types_df[["int64_col"]] + bf_df["int64_col"] = bf_df["int64_col"] + 1 + snapshot.assert_match(bf_df.sql, "out.sql") -def test_compile_string_add(scalars_types_df: bpd.DataFrame, snapshot): +def 
test_add_string(scalars_types_df: bpd.DataFrame, snapshot): bf_df = scalars_types_df[["string_col"]] + bf_df["string_col"] = bf_df["string_col"] + "a" + snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_op_registration.py b/tests/unit/core/compile/sqlglot/expressions/test_op_registration.py new file mode 100644 index 0000000000..1c49dde6ca --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/test_op_registration.py @@ -0,0 +1,43 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from sqlglot import expressions as sge + +from bigframes.core.compile.sqlglot.expressions import op_registration +from bigframes.operations import numeric_ops + + +def test_register_then_get(): + reg = op_registration.OpRegistration() + input = sge.to_identifier("A") + op = numeric_ops.add_op + + @reg.register(numeric_ops.AddOp) + def test_func(op: numeric_ops.AddOp, input: sge.Expression) -> sge.Expression: + return input + + assert reg[numeric_ops.add_op](op, input) == test_func(op, input) + assert reg[numeric_ops.add_op.name](op, input) == test_func(op, input) + + +def test_register_function_first_argument_is_not_scalar_op_raise_error(): + reg = op_registration.OpRegistration() + + @reg.register(numeric_ops.AddOp) + def test_func(input: sge.Expression) -> sge.Expression: + return input + + with pytest.raises(ValueError, match=r".*first parameter must be an operator.*"): + test_func(sge.to_identifier("A")) diff --git a/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py new file mode 100644 index 0000000000..317c2f891b --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py @@ -0,0 +1,44 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from bigframes import bigquery +import bigframes.pandas as bpd + +pytest.importorskip("pytest_snapshot") + + +def test_array_to_string(repeated_types_df: bpd.DataFrame, snapshot): + result = bigquery.array_to_string(repeated_types_df["string_list_col"], ".") + + snapshot.assert_match(result.to_frame().sql, "out.sql") + + +def test_array_index(repeated_types_df: bpd.DataFrame, snapshot): + result = repeated_types_df["string_list_col"].list[1] + + snapshot.assert_match(result.to_frame().sql, "out.sql") + + +def test_array_slice_with_only_start(repeated_types_df: bpd.DataFrame, snapshot): + result = repeated_types_df["string_list_col"].list[1:] + + snapshot.assert_match(result.to_frame().sql, "out.sql") + + +def test_array_slice_with_start_and_stop(repeated_types_df: bpd.DataFrame, snapshot): + result = repeated_types_df["string_list_col"].list[1:5] + + snapshot.assert_match(result.to_frame().sql, "out.sql") From 80bac0ff6f4091b34ccae4ee5cb045e684c8c98c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 25 Jun 2025 16:50:45 -0500 Subject: [PATCH 11/28] chore: add script to generate `_read_gbq_colab` BigQuery benchmark tables (#1846) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add script to generate BigQuery benchmark tables This script creates 10 BigQuery tables with varying schemas and data volumes based on predefined statistics. Key features: - Dynamically generates table schemas to match target average row sizes, maximizing data type diversity. - Generates random data for each table, respecting BigQuery data types. - Includes placeholders for GCP project and dataset IDs. - Handles very large table data generation by capping row numbers for in-memory processing and printing warnings (actual BQ load for huge tables would require GCS load jobs). - Adds a specific requirements file for this script: `scripts/requirements-create_tables.txt`. 
* Refactor: Vectorize data generation in benchmark script Vectorized the `generate_random_data` function in `scripts/create_read_gbq_colab_benchmark_tables.py`. Changes include: - Using NumPy's vectorized operations (`size` parameter in random functions, `np.vectorize`) to generate arrays of random values for most data types at once. - Employing list comprehensions for transformations on these arrays (e.g., formatting dates, generating strings from character arrays). - Retaining loops for types where full vectorization is overly complex or offers little benefit (e.g., precise byte-length JSON strings, BYTES generation via `rng.bytes`). - Assembling the final list of row dictionaries from the generated columnar data. This should improve performance for data generation, especially for tables with a large number of rows. * Implement batched data generation and loading Refactored the script to process data in batches, significantly improving memory efficiency for large tables. Changes include: 1. `generate_random_data` function: * Modified to be a generator, yielding data in chunks of a specified `batch_size`. * The core vectorized logic for creating column data within each batch is retained. 2. `create_and_load_table` function: * Updated to consume data from the `generate_random_data` generator. * No longer accepts a full list of data rows. * For actual BigQuery loads, it iterates through generated batches and further sub-batches them (if necessary) for optimal `client.insert_rows_json` calls. * Simulation mode now reflects this batched processing by showing details of the first generated batch and estimated total batches. 3. `main` function: * Removed pre-generation of the entire dataset or a capped sample. * The call to `create_and_load_table` now passes parameters required for it to invoke and manage the data generator (total `num_rows`, `rng` object, and `DATA_GENERATION_BATCH_SIZE`). 
* Optimize DATETIME/TIMESTAMP generation with numpy.datetime_as_string Refactored the `generate_random_data` function to use `numpy.datetime_as_string` for converting `numpy.datetime64` arrays to ISO-formatted strings for DATETIME and TIMESTAMP columns. - For DATETIME: - Python `datetime.datetime` objects are created in a list first (to ensure date component validity) then converted to `numpy.datetime64[us]`. - `numpy.datetime_as_string` is used, and the output 'T' separator is replaced with a space. - For TIMESTAMP: - `numpy.datetime64[us]` arrays are constructed directly from epoch seconds and microsecond offsets. - `numpy.datetime_as_string` is used with `timezone='UTC'` to produce a 'Z'-suffixed UTC string. This change improves performance and code clarity for generating these timestamp string formats. * Add argparse for project and dataset IDs Implemented command-line arguments for specifying Google Cloud Project ID and BigQuery Dataset ID, replacing hardcoded global constants. Changes: - Imported `argparse` module. - Added optional `--project_id` (-p) and `--dataset_id` (-d) arguments to `main()`. - If `project_id` or `dataset_id` are not provided, the script defaults to simulation mode. - `create_and_load_table` now checks for the presence of both IDs to determine if it should attempt actual BigQuery operations or run in simulation. - Error handling in `create_and_load_table` for BQ operations was adjusted to log errors per table and continue processing remaining tables, rather than halting the script. * Add unit tests for table generation script Added unit tests for `get_bq_schema` and `generate_random_data` functions in `create_read_gbq_colab_benchmark_tables.py`. - Created `scripts/create_read_gbq_colab_benchmark_tables_test.py`. - Implemented pytest-style tests covering various scenarios: - For `get_bq_schema`: - Zero and small target byte sizes. - Exact fits with fixed-size types. - Inclusion and expansion of flexible types. 
- Generation of all fixed types where possible. - Uniqueness of column names. - Helper function `_calculate_row_size` used for validation. - For `generate_random_data`: - Zero rows case. - Basic schema and batching logic (single batch, multiple full batches, partial last batches). - Generation of all supported data types, checking Python types, string formats (using regex and `fromisoformat`), lengths for string/bytes, and JSON validity. - Added `pytest` and `pandas` (for pytest compatibility in the current project environment) to `scripts/requirements-create_tables.txt`. - All tests pass. * refactor * reduce duplicated work * only use percentile in table name * use annotations to not fail in 3.9 * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Update scripts/create_read_gbq_colab_benchmark_tables.py * Delete scripts/requirements-create_tables.txt * base64 encode * refactor batch generation * adjust test formatting * parallel processing --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> Co-authored-by: Owl Bot --- .pre-commit-config.yaml | 2 +- noxfile.py | 2 + .../create_read_gbq_colab_benchmark_tables.py | 552 ++++++++++++++++++ ...te_read_gbq_colab_benchmark_tables_test.py | 333 +++++++++++ scripts/readme-gen/readme_gen.py | 1 - 5 files changed, 888 insertions(+), 2 deletions(-) create mode 100644 scripts/create_read_gbq_colab_benchmark_tables.py create mode 100644 scripts/create_read_gbq_colab_benchmark_tables_test.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7e46c73d0d..93cc5e4210 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,6 +39,6 @@ repos: rev: v1.15.0 hooks: - id: mypy - additional_dependencies: [types-requests, types-tabulate, pandas-stubs<=2.2.3.241126] + additional_dependencies: [types-requests, types-tabulate, types-PyYAML, pandas-stubs<=2.2.3.241126] 
exclude: "^third_party" args: ["--check-untyped-defs", "--explicit-package-bases", "--ignore-missing-imports"] diff --git a/noxfile.py b/noxfile.py index a1e8e5b99b..9346f26cba 100644 --- a/noxfile.py +++ b/noxfile.py @@ -53,6 +53,7 @@ LINT_PATHS = [ "docs", "bigframes", + "scripts", "tests", "third_party", "noxfile.py", @@ -275,6 +276,7 @@ def mypy(session): "types-requests", "types-setuptools", "types-tabulate", + "types-PyYAML", "polars", "anywidget", ] diff --git a/scripts/create_read_gbq_colab_benchmark_tables.py b/scripts/create_read_gbq_colab_benchmark_tables.py new file mode 100644 index 0000000000..703c946360 --- /dev/null +++ b/scripts/create_read_gbq_colab_benchmark_tables.py @@ -0,0 +1,552 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import argparse +import base64 +import concurrent.futures +import datetime +import json +import math +import time +from typing import Any, Iterable, MutableSequence, Sequence + +from google.cloud import bigquery +import numpy as np + +# --- Input Data --- +# Generated by querying bigquery-magics usage. See internal issue b/420984164. 
+TABLE_STATS: dict[str, list[float]] = { + "percentile": [9, 19, 29, 39, 49, 59, 69, 79, 89, 99], + "materialized_or_scanned_bytes": [ + 0.0, + 0.0, + 4102.0, + 76901.0, + 351693.0, + 500000.0, + 500000.0, + 1320930.0, + 17486432.0, + 1919625975.0, + ], + "num_materialized_or_scanned_rows": [ + 0.0, + 6.0, + 100.0, + 4955.0, + 23108.0, + 139504.0, + 616341.0, + 3855698.0, + 83725698.0, + 5991998082.0, + ], + "avg_row_bytes": [ + 0.00014346299635435792, + 0.005370969708923197, + 0.3692756731526246, + 4.079344721151818, + 7.5418, + 12.528863516404146, + 22.686258546389798, + 48.69689224091025, + 100.90817356205852, + 2020, + ], + "materialized_mb": [ + 0.0, + 0.0, + 0.004102, + 0.076901, + 0.351693, + 0.5, + 0.5, + 1.32093, + 17.486432, + 1919.625975, + ], +} + +BIGQUERY_DATA_TYPE_SIZES = { + "BOOL": 1, + "DATE": 8, + "FLOAT64": 8, + "INT64": 8, + "DATETIME": 8, + "TIMESTAMP": 8, + "TIME": 8, + "NUMERIC": 16, + # Flexible types. + # JSON base size is its content, BYTES/STRING have 2 byte overhead + content + "JSON": 0, + "BYTES": 2, + "STRING": 2, +} +FIXED_TYPES = [ + "BOOL", + "INT64", + "FLOAT64", + "NUMERIC", + "DATE", + "DATETIME", + "TIMESTAMP", + "TIME", +] +FLEXIBLE_TYPES = ["STRING", "BYTES", "JSON"] + +JSON_CHAR_LIST = list("abcdef") +STRING_CHAR_LIST = list("abcdefghijklmnopqrstuvwxyz0123456789") + +# --- Helper Functions --- + + +def get_bq_schema(target_row_size_bytes: int) -> Sequence[tuple[str, str, int | None]]: + """ + Determines the BigQuery table schema to match the target_row_size_bytes. + Prioritizes fixed-size types for diversity, then uses flexible types. + Returns a list of tuples: (column_name, type_name, length_for_flexible_type). + Length is None for fixed-size types. + """ + schema: MutableSequence[tuple[str, str, int | None]] = [] + current_size = 0 + col_idx = 0 + + for bq_type in FIXED_TYPES: + # For simplicity, we'll allow slight overage if only fixed fields are chosen. 
+ if current_size >= target_row_size_bytes: + break + + type_size = BIGQUERY_DATA_TYPE_SIZES[bq_type] + schema.append((f"col_{bq_type.lower()}_{col_idx}", bq_type, None)) + current_size += type_size + col_idx += 1 + + # Use flexible-size types to fill remaining space + + # Attempt to add one of each flexible type if space allows + if current_size < target_row_size_bytes: + remaining_bytes_for_content = target_row_size_bytes - current_size + + # For simplicity, divide the remaining bytes evenly across the flexible + # columns. + target_size = int(math.ceil(remaining_bytes_for_content / len(FLEXIBLE_TYPES))) + + for bq_type in FLEXIBLE_TYPES: + base_cost = BIGQUERY_DATA_TYPE_SIZES[bq_type] + min_content_size = max(0, target_size - base_cost) + + schema.append( + (f"col_{bq_type.lower()}_{col_idx}", bq_type, min_content_size) + ) + current_size += base_cost + min_content_size + col_idx += 1 + + return schema + + +def generate_bool_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + return rng.choice([True, False], size=num_rows) + + +def generate_int64_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + return rng.integers(-(10**18), 10**18, size=num_rows, dtype=np.int64) + + +def generate_float64_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + return rng.random(size=num_rows) * 2 * 10**10 - 10**10 + + +def generate_numeric_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + raw_numerics = rng.random(size=num_rows) * 2 * 10**28 - 10**28 + format_numeric_vectorized = np.vectorize(lambda x: f"{x:.9f}") + return format_numeric_vectorized(raw_numerics) + + +def generate_date_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + start_date_ord = datetime.date(1, 1, 1).toordinal() + max_days = (datetime.date(9999, 12, 
31) - datetime.date(1, 1, 1)).days + day_offsets = rng.integers(0, max_days + 1, size=num_rows) + date_ordinals = start_date_ord + day_offsets + return np.array( + [ + datetime.date.fromordinal(int(ordinal)).isoformat() + for ordinal in date_ordinals + ] + ) + + +def generate_numpy_datetimes(num_rows: int, rng: np.random.Generator) -> np.ndarray: + # Generate seconds from a broad range (e.g., year 1 to 9999) + # Note: Python's datetime.timestamp() might be limited by system's C mktime. + # For broader range with np.datetime64, it's usually fine. + # Let's generate epoch seconds relative to Unix epoch for np.datetime64 compatibility + min_epoch_seconds = int( + datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc).timestamp() + ) + # Max for datetime64[s] is far out, but let's bound it reasonably for BQ. + max_epoch_seconds = int( + datetime.datetime( + 9999, 12, 28, 23, 59, 59, tzinfo=datetime.timezone.utc + ).timestamp() + ) + + epoch_seconds = rng.integers( + min_epoch_seconds, + max_epoch_seconds + 1, + size=num_rows, + dtype=np.int64, + ) + microseconds_offset = rng.integers(0, 1000000, size=num_rows, dtype=np.int64) + + # Create datetime64[s] from epoch seconds and add microseconds as timedelta64[us] + np_timestamps_s = epoch_seconds.astype("datetime64[s]") + np_microseconds_td = microseconds_offset.astype("timedelta64[us]") + return np_timestamps_s + np_microseconds_td + + +def generate_datetime_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + np_datetimes = generate_numpy_datetimes(num_rows, rng) + + # np.datetime_as_string produces 'YYYY-MM-DDTHH:MM:SS.ffffff' + # BQ DATETIME typically uses a space separator: 'YYYY-MM-DD HH:MM:SS.ffffff' + datetime_strings = np.datetime_as_string(np_datetimes, unit="us") + return np.array([s.replace("T", " ") for s in datetime_strings]) + + +def generate_timestamp_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> 
np.ndarray: + np_datetimes = generate_numpy_datetimes(num_rows, rng) + + # Convert to string with UTC timezone indicator + # np.datetime_as_string with timezone='UTC' produces 'YYYY-MM-DDTHH:MM:SS.ffffffZ' + # BigQuery generally accepts this for TIMESTAMP. + return np.datetime_as_string(np_datetimes, unit="us", timezone="UTC") + + +def generate_time_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + hours = rng.integers(0, 24, size=num_rows) + minutes = rng.integers(0, 60, size=num_rows) + seconds = rng.integers(0, 60, size=num_rows) + microseconds = rng.integers(0, 1000000, size=num_rows) + time_list = [ + datetime.time(hours[i], minutes[i], seconds[i], microseconds[i]).isoformat() + for i in range(num_rows) + ] + return np.array(time_list) + + +def generate_json_row(content_length: int, rng: np.random.Generator) -> str: + json_val_len = max(0, content_length - 5) + json_val_chars = rng.choice(JSON_CHAR_LIST, size=json_val_len) + json_obj = {"k": "".join(json_val_chars)} + return json.dumps(json_obj) + + +def generate_json_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + content_length = content_length if content_length is not None else 10 + json_list = [ + generate_json_row(content_length=content_length, rng=rng) + for _ in range(num_rows) + ] + return np.array(json_list) + + +def generate_string_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + content_length = content_length if content_length is not None else 1 + content_length = max(0, content_length) + chars_array = rng.choice(STRING_CHAR_LIST, size=(num_rows, content_length)) + return np.array(["".join(row_chars) for row_chars in chars_array]) + + +def generate_bytes_batch( + num_rows: int, rng: np.random.Generator, content_length: int | None = None +) -> np.ndarray: + content_length = content_length if content_length is not None else 1 + 
content_length = max(0, content_length) + return np.array( + [ + base64.b64encode(rng.bytes(content_length)).decode("utf-8") + for _ in range(num_rows) + ] + ) + + +BIGQUERY_DATA_TYPE_GENERATORS = { + "BOOL": generate_bool_batch, + "DATE": generate_date_batch, + "FLOAT64": generate_float64_batch, + "INT64": generate_int64_batch, + "DATETIME": generate_datetime_batch, + "TIMESTAMP": generate_timestamp_batch, + "TIME": generate_time_batch, + "NUMERIC": generate_numeric_batch, + "JSON": generate_json_batch, + "BYTES": generate_bytes_batch, + "STRING": generate_string_batch, +} + + +def generate_work_items( + table_id: str, + schema: Sequence[tuple[str, str, int | None]], + num_rows: int, + batch_size: int, +) -> Iterable[tuple[str, Sequence[tuple[str, str, int | None]], int]]: + """ + Generates work items of appropriate batch sizes. + """ + if num_rows == 0: + return + + generated_rows_total = 0 + + while generated_rows_total < num_rows: + current_batch_size = min(batch_size, num_rows - generated_rows_total) + if current_batch_size == 0: + break + + yield (table_id, schema, current_batch_size) + generated_rows_total += current_batch_size + + +def generate_batch( + schema: Sequence[tuple[str, str, int | None]], + num_rows: int, + rng: np.random.Generator, +) -> list[dict[str, Any]]: + col_names_ordered = [s[0] for s in schema] + + columns_data_batch = {} + for col_name, bq_type, length in schema: + generate_batch = BIGQUERY_DATA_TYPE_GENERATORS[bq_type] + columns_data_batch[col_name] = generate_batch( + num_rows, rng, content_length=length + ) + + # Turn numpy objects into Python objects. 
+ # https://stackoverflow.com/a/32850511/101923 + columns_data_batch_json = {} + for column in columns_data_batch: + columns_data_batch_json[column] = columns_data_batch[column].tolist() + + # Assemble batch of rows + batch_data = [] + for i in range(num_rows): + row = { + col_name: columns_data_batch_json[col_name][i] + for col_name in col_names_ordered + } + batch_data.append(row) + + return batch_data + + +def generate_and_load_batch( + client: bigquery.Client, + table_id: str, + schema_def: Sequence[tuple[str, str, int | None]], + num_rows: int, + rng: np.random.Generator, +): + bq_schema = [] + for col_name, type_name, _ in schema_def: + bq_schema.append(bigquery.SchemaField(col_name, type_name)) + table = bigquery.Table(table_id, schema=bq_schema) + + generated_data_chunk = generate_batch(schema_def, num_rows, rng) + errors = client.insert_rows_json(table, generated_data_chunk) + if errors: + raise ValueError(f"Encountered errors while inserting sub-batch: {errors}") + + +def create_and_load_table( + client: bigquery.Client | None, + project_id: str, + dataset_id: str, + table_name: str, + schema_def: Sequence[tuple[str, str, int | None]], + num_rows: int, + executor: concurrent.futures.Executor, +): + """Creates a BigQuery table and loads data into it by consuming a data generator.""" + + if not client: + print(f"Simulating: Generated schema: {schema_def}") + return + + # BQ client library streaming insert batch size (rows per API call) + # This is different from data_gen_batch_size which is for generating data. + # We can make BQ_LOAD_BATCH_SIZE smaller than data_gen_batch_size if needed. + BQ_LOAD_BATCH_SIZE = 500 + + # Actual BigQuery operations occur here because both project_id and dataset_id are provided + print( + f"Attempting BigQuery operations for table {table_name} in project '{project_id}', dataset '{dataset_id}'." 
+ ) + table_id = f"{project_id}.{dataset_id}.{table_name}" + + bq_schema = [] + for col_name, type_name, _ in schema_def: + bq_schema.append(bigquery.SchemaField(col_name, type_name)) + + table = bigquery.Table(table_id, schema=bq_schema) + print(f"(Re)creating table {table_id}...") + table = client.create_table(table, exists_ok=True) + print(f"Table {table_id} created successfully or already exists.") + + # Query in case there's something in the streaming buffer already. + table_rows = next( + iter(client.query_and_wait(f"SELECT COUNT(*) FROM `{table_id}`")) + )[0] + print(f"Table {table_id} has {table_rows} rows.") + num_rows = max(0, num_rows - table_rows) + + if num_rows <= 0: + print(f"No rows to load. Requested {num_rows} rows. Skipping.") + return + + print(f"Starting to load {num_rows} rows into {table_id} in batches...") + + previous_status_time = 0.0 + generated_rows_total = 0 + + for completed_rows in executor.map( + worker_process_item, + generate_work_items( + table_id, + schema_def, + num_rows, + BQ_LOAD_BATCH_SIZE, + ), + ): + generated_rows_total += completed_rows + + current_time = time.monotonic() + if current_time - previous_status_time > 5: + print(f"Wrote {generated_rows_total} out of {num_rows} rows.") + previous_status_time = current_time + + +worker_client: bigquery.Client | None = None +worker_rng: np.random.Generator | None = None + + +def worker_initializer(project_id: str | None): + global worker_client, worker_rng + + # One client per process, since multiprocessing and client connections don't + # play nicely together. 
+ if project_id is not None: + worker_client = bigquery.Client(project=project_id) + + worker_rng = np.random.default_rng() + + +def worker_process_item( + work_item: tuple[str, Sequence[tuple[str, str, int | None]], int] +): + global worker_client, worker_rng + + if worker_client is None or worker_rng is None: + raise ValueError("Worker not initialized.") + + table_id, schema_def, num_rows = work_item + generate_and_load_batch(worker_client, table_id, schema_def, num_rows, worker_rng) + return num_rows + + +# --- Main Script Logic --- +def main(): + """Main function to create and populate BigQuery tables.""" + + parser = argparse.ArgumentParser( + description="Generate and load BigQuery benchmark tables." + ) + parser.add_argument( + "-p", + "--project_id", + type=str, + default=None, + help="Google Cloud Project ID. If not provided, script runs in simulation mode.", + ) + parser.add_argument( + "-d", + "--dataset_id", + type=str, + default=None, + help="BigQuery Dataset ID within the project. 
If not provided, script runs in simulation mode.", + ) + args = parser.parse_args() + + num_percentiles = len(TABLE_STATS["percentile"]) + client = None + + if args.project_id and args.dataset_id: + client = bigquery.Client(project=args.project_id) + dataset = bigquery.Dataset(f"{args.project_id}.{args.dataset_id}") + client.create_dataset(dataset, exists_ok=True) + + with concurrent.futures.ProcessPoolExecutor( + initializer=worker_initializer, initargs=(args.project_id,) + ) as executor: + for i in range(num_percentiles): + percentile = TABLE_STATS["percentile"][i] + avg_row_bytes_raw = TABLE_STATS["avg_row_bytes"][i] + num_rows_raw = TABLE_STATS["num_materialized_or_scanned_rows"][i] + + target_row_bytes = max(1, int(math.ceil(avg_row_bytes_raw))) + num_rows = max(1, int(math.ceil(num_rows_raw))) + + table_name = f"percentile_{percentile:02d}" + print(f"\n--- Processing Table: {table_name} ---") + print(f"Target average row bytes (rounded up): {target_row_bytes}") + print(f"Number of rows (rounded up): {num_rows}") + + schema_definition = get_bq_schema(target_row_bytes) + print(f"Generated Schema: {schema_definition}") + + create_and_load_table( + client, + args.project_id or "", + args.dataset_id or "", + table_name, + schema_definition, + num_rows, + executor, + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/create_read_gbq_colab_benchmark_tables_test.py b/scripts/create_read_gbq_colab_benchmark_tables_test.py new file mode 100644 index 0000000000..89c49e4243 --- /dev/null +++ b/scripts/create_read_gbq_colab_benchmark_tables_test.py @@ -0,0 +1,333 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import base64 +import datetime +import json +import math +import re + +# Assuming the script to be tested is in the same directory or accessible via PYTHONPATH +from create_read_gbq_colab_benchmark_tables import ( + BIGQUERY_DATA_TYPE_SIZES, + generate_batch, + generate_work_items, + get_bq_schema, +) +import numpy as np +import pytest + + +# Helper function to calculate estimated row size from schema +def _calculate_row_size(schema: list[tuple[str, str, int | None]]) -> int: + """Calculates the estimated byte size of a row based on the schema. + Note: This is a simplified calculation for testing and might not perfectly + match BigQuery's internal storage, especially for complex types or NULLs. 
+ """ + size = 0 + for _, bq_type, length in schema: + if bq_type in ["STRING", "BYTES", "JSON"]: + # Base cost (e.g., 2 bytes) + content length + size += BIGQUERY_DATA_TYPE_SIZES[bq_type] + ( + length if length is not None else 0 + ) + elif bq_type in BIGQUERY_DATA_TYPE_SIZES: + size += BIGQUERY_DATA_TYPE_SIZES[bq_type] + else: + raise AssertionError(f"Got unexpected type {bq_type}") + return size + + +# --- Tests for get_bq_schema --- + + +def test_get_bq_schema_zero_bytes(): + assert get_bq_schema(0) == [] + + +def test_get_bq_schema_one_byte(): + schema = get_bq_schema(1) + + assert len(schema) == 1 + assert schema[0][1] == "BOOL" # ('col_bool_fallback_0', 'BOOL', None) or similar + assert _calculate_row_size(schema) == 1 + + +def test_get_bq_schema_exact_fixed_fit(): + # BOOL (1) + INT64 (8) = 9 bytes + target_size = 9 + schema = get_bq_schema(target_size) + + assert len(schema) == 2 + assert schema[0][1] == "BOOL" + assert schema[1][1] == "INT64" + assert _calculate_row_size(schema) == target_size + + +def test_get_bq_schema_needs_flexible_string(): + # Sum of all fixed types: + # BOOL 1, INT64 8, FLOAT64 8, NUMERIC 16, DATE 8, DATETIME 8, TIMESTAMP 8, TIME 8 + # Total = 1+8+8+16+8+8+8+8 = 65 + target_size = 65 + 1 + schema = get_bq_schema(target_size) + + assert _calculate_row_size(schema) == 65 + 2 + 2 + 1 + + string_cols = [s for s in schema if s[1] == "STRING"] + assert len(string_cols) == 1 + assert string_cols[0][2] == 0 + + bytes_cols = [s for s in schema if s[1] == "BYTES"] + assert len(bytes_cols) == 1 + assert bytes_cols[0][2] == 0 + + json_cols = [s for s in schema if s[1] == "JSON"] + assert len(json_cols) == 1 + assert json_cols[0][2] == 1 + + +def test_get_bq_schema_flexible_expansion(): + # Sum of all fixed types: + # BOOL 1, INT64 8, FLOAT64 8, NUMERIC 16, DATE 8, DATETIME 8, TIMESTAMP 8, TIME 8 + # Total = 1+8+8+16+8+8+8+8 = 65 + target_size = 65 + 3 * 5 + schema = get_bq_schema(target_size) + + assert _calculate_row_size(schema) == 
target_size + + string_cols = [s for s in schema if s[1] == "STRING"] + assert len(string_cols) == 1 + assert string_cols[0][2] == 3 + + bytes_cols = [s for s in schema if s[1] == "BYTES"] + assert len(bytes_cols) == 1 + assert bytes_cols[0][2] == 3 + + json_cols = [s for s in schema if s[1] == "JSON"] + assert len(json_cols) == 1 + assert json_cols[0][2] == 5 + + +def test_get_bq_schema_all_fixed_types_possible(): + # Sum of all fixed types: + # BOOL 1, INT64 8, FLOAT64 8, NUMERIC 16, DATE 8, DATETIME 8, TIMESTAMP 8, TIME 8 + # Total = 1+8+8+16+8+8+8+8 = 65 + target_size = 65 + schema = get_bq_schema(target_size) + + expected_fixed_types = { + "BOOL", + "INT64", + "FLOAT64", + "NUMERIC", + "DATE", + "DATETIME", + "TIMESTAMP", + "TIME", + } + present_types = {s[1] for s in schema} + + assert expected_fixed_types.issubset(present_types) + + # Check if the size is close to target. + # All fixed (65) + calculated_size = _calculate_row_size(schema) + assert calculated_size == target_size + + +def test_get_bq_schema_uniqueness_of_column_names(): + target_size = 100 # A size that generates multiple columns + schema = get_bq_schema(target_size) + + column_names = [s[0] for s in schema] + assert len(column_names) == len(set(column_names)) + + +# --- Tests for generate_work_items --- + + +def test_generate_work_items_zero_rows(): + schema = [("col_int", "INT64", None)] + data_generator = generate_work_items( + "some_table", schema, num_rows=0, batch_size=10 + ) + + # Expect the generator to be exhausted + with pytest.raises(StopIteration): + next(data_generator) + + +def test_generate_work_items_basic_schema_and_batching(): + schema = [("id", "INT64", None), ("is_active", "BOOL", None)] + num_rows = 25 + batch_size = 10 + + generated_rows_count = 0 + batch_count = 0 + for work_item in generate_work_items("some_table", schema, num_rows, batch_size): + table_id, schema_def, num_rows_in_batch = work_item + assert table_id == "some_table" + assert schema_def == schema + assert 
num_rows_in_batch <= num_rows + assert num_rows_in_batch <= batch_size + batch_count += 1 + generated_rows_count += num_rows_in_batch + + assert generated_rows_count == num_rows + assert batch_count == math.ceil(num_rows / batch_size) # 25/10 = 2.5 -> 3 batches + + +def test_generate_work_items_batch_size_larger_than_num_rows(): + schema = [("value", "FLOAT64", None)] + num_rows = 5 + batch_size = 100 + + generated_rows_count = 0 + batch_count = 0 + for work_item in generate_work_items("some_table", schema, num_rows, batch_size): + table_id, schema_def, num_rows_in_batch = work_item + assert table_id == "some_table" + assert schema_def == schema + assert num_rows_in_batch == num_rows # Should be one batch with all rows + batch_count += 1 + generated_rows_count += num_rows_in_batch + + assert generated_rows_count == num_rows + assert batch_count == 1 + + +def test_generate_work_items_all_datatypes(rng): + schema = [ + ("c_bool", "BOOL", None), + ("c_int64", "INT64", None), + ("c_float64", "FLOAT64", None), + ("c_numeric", "NUMERIC", None), + ("c_date", "DATE", None), + ("c_datetime", "DATETIME", None), + ("c_timestamp", "TIMESTAMP", None), + ("c_time", "TIME", None), + ("c_string", "STRING", 10), + ("c_bytes", "BYTES", 5), + ("c_json", "JSON", 20), # Length for JSON is content hint + ] + num_rows = 3 + batch_size = 2 # To test multiple batches + + total_rows_processed = 0 + for work_item in generate_work_items("some_table", schema, num_rows, batch_size): + table_id, schema_def, num_rows_in_batch = work_item + assert table_id == "some_table" + assert schema_def == schema + assert num_rows_in_batch <= batch_size + assert num_rows_in_batch <= num_rows + + total_rows_processed += num_rows_in_batch + + assert total_rows_processed == num_rows + + +# --- Pytest Fixture for RNG --- +@pytest.fixture +def rng(): + return np.random.default_rng(seed=42) + + +def test_generate_batch_basic_schema(rng): + schema = [("id", "INT64", None), ("is_active", "BOOL", None)] + batch = 
generate_batch(schema, 5, rng) + + assert len(batch) == 5 + + for row in batch: + assert isinstance(row, dict) + assert "id" in row + assert "is_active" in row + assert isinstance(row["id"], int) + assert isinstance(row["is_active"], bool) + + +def test_generate_batch_all_datatypes(rng): + schema = [ + ("c_bool", "BOOL", None), + ("c_int64", "INT64", None), + ("c_float64", "FLOAT64", None), + ("c_numeric", "NUMERIC", None), + ("c_date", "DATE", None), + ("c_datetime", "DATETIME", None), + ("c_timestamp", "TIMESTAMP", None), + ("c_time", "TIME", None), + ("c_string", "STRING", 10), + ("c_bytes", "BYTES", 5), + ("c_json", "JSON", 20), # Length for JSON is content hint + ] + num_rows = 3 + + date_pattern = re.compile(r"^\d{4}-\d{2}-\d{2}$") + time_pattern = re.compile(r"^\d{2}:\d{2}:\d{2}(\.\d{1,6})?$") + # BQ DATETIME: YYYY-MM-DD HH:MM:SS.ffffff + datetime_pattern = re.compile(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d{1,6})?$") + # BQ TIMESTAMP (UTC 'Z'): YYYY-MM-DDTHH:MM:SS.ffffffZ + timestamp_pattern = re.compile( + r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,6})?Z$" + ) + numeric_pattern = re.compile(r"^-?\d+\.\d{9}$") + + batch = generate_batch(schema, num_rows, rng) + assert len(batch) == num_rows + + for row in batch: + assert isinstance(row["c_bool"], bool) + assert isinstance(row["c_int64"], int) + assert isinstance(row["c_float64"], float) + + assert isinstance(row["c_numeric"], str) + assert numeric_pattern.match(row["c_numeric"]) + + assert isinstance(row["c_date"], str) + assert date_pattern.match(row["c_date"]) + datetime.date.fromisoformat(row["c_date"]) # Check parsable + + assert isinstance(row["c_datetime"], str) + assert datetime_pattern.match(row["c_datetime"]) + datetime.datetime.fromisoformat(row["c_datetime"]) # Check parsable + + assert isinstance(row["c_timestamp"], str) + assert timestamp_pattern.match(row["c_timestamp"]) + # datetime.fromisoformat can parse 'Z' if Python >= 3.11, or needs replace('Z', '+00:00') + dt_obj = 
datetime.datetime.fromisoformat( + row["c_timestamp"].replace("Z", "+00:00") + ) + assert dt_obj.tzinfo == datetime.timezone.utc + + assert isinstance(row["c_time"], str) + assert time_pattern.match(row["c_time"]) + datetime.time.fromisoformat(row["c_time"]) # Check parsable + + assert isinstance(row["c_string"], str) + assert len(row["c_string"]) == 10 + + c_bytes = base64.b64decode(row["c_bytes"]) + assert isinstance(c_bytes, bytes) + assert len(c_bytes) == 5 + + assert isinstance(row["c_json"], str) + try: + json.loads(row["c_json"]) # Check if it's valid JSON + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON string generated: {row['c_json']}") + # Note: Exact length check for JSON is hard due to content variability and escaping. + # The 'length' parameter for JSON in schema is a hint for content size. + # We are primarily testing that it's valid JSON. diff --git a/scripts/readme-gen/readme_gen.py b/scripts/readme-gen/readme_gen.py index 8f5e248a0d..ceb1eada7c 100644 --- a/scripts/readme-gen/readme_gen.py +++ b/scripts/readme-gen/readme_gen.py @@ -24,7 +24,6 @@ import jinja2 import yaml - jinja_env = jinja2.Environment( trim_blocks=True, loader=jinja2.FileSystemLoader( From 4da333eb5fa70537f6cf30c437330373f2d748f5 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 25 Jun 2025 16:22:39 -0700 Subject: [PATCH 12/28] test: Enable floordiv local testing (#1856) --- bigframes/core/compile/polars/compiler.py | 4 ++ bigframes/core/compile/polars/lowering.py | 47 +++++++++++++++++++ bigframes/core/expression.py | 26 ++++++++++- bigframes/core/nodes.py | 37 ++++++++++++--- bigframes/core/rewrite/op_lowering.py | 57 +++++++++++++++++++++++ tests/unit/test_dataframe_polars.py | 12 ++--- 6 files changed, 168 insertions(+), 15 deletions(-) create mode 100644 bigframes/core/compile/polars/lowering.py create mode 100644 bigframes/core/rewrite/op_lowering.py diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 
62654c1518..cc007623e1 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -23,9 +23,11 @@ import bigframes.core from bigframes.core import identifiers, nodes, ordering, window_spec +from bigframes.core.compile.polars import lowering import bigframes.core.expression as ex import bigframes.core.guid as guid import bigframes.core.rewrite +import bigframes.core.rewrite.schema_binding import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -403,6 +405,8 @@ def compile(self, array_value: bigframes.core.ArrayValue) -> pl.LazyFrame: node = bigframes.core.rewrite.column_pruning(node) node = nodes.bottom_up(node, bigframes.core.rewrite.rewrite_slice) node = bigframes.core.rewrite.pull_out_window_order(node) + node = bigframes.core.rewrite.schema_binding.bind_schema_to_tree(node) + node = lowering.lower_ops_to_polars(node) return self.compile_node(node) @functools.singledispatchmethod diff --git a/bigframes/core/compile/polars/lowering.py b/bigframes/core/compile/polars/lowering.py new file mode 100644 index 0000000000..88e2d6e599 --- /dev/null +++ b/bigframes/core/compile/polars/lowering.py @@ -0,0 +1,47 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from bigframes import dtypes +from bigframes.core import bigframe_node, expression +from bigframes.core.rewrite import op_lowering +from bigframes.operations import numeric_ops +import bigframes.operations as ops + +# TODO: Would be more precise to actually have separate op set for polars ops (where they diverge from the original ops) + + +class LowerFloorDivRule(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return numeric_ops.FloorDivOp + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + dividend = expr.children[0] + divisor = expr.children[1] + using_floats = (dividend.output_type == dtypes.FLOAT_DTYPE) or ( + divisor.output_type == dtypes.FLOAT_DTYPE + ) + inf_or_zero = ( + expression.const(float("INF")) if using_floats else expression.const(0) + ) + zero_result = ops.mul_op.as_expr(inf_or_zero, dividend) + divisor_is_zero = ops.eq_op.as_expr(divisor, expression.const(0)) + return ops.where_op.as_expr(zero_result, divisor_is_zero, expr) + + +POLARS_LOWERING_RULES = (LowerFloorDivRule(),) + + +def lower_ops_to_polars(root: bigframe_node.BigFrameNode) -> bigframe_node.BigFrameNode: + return op_lowering.lower_ops(root, rules=POLARS_LOWERING_RULES) diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 238b588fea..40ba70c555 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -19,7 +19,7 @@ import functools import itertools import typing -from typing import Generator, Mapping, TypeVar, Union +from typing import Callable, Generator, Mapping, TypeVar, Union import pandas as pd @@ -249,6 +249,10 @@ def is_identity(self) -> bool: """True for identity operation that does not transform input.""" return False + @abc.abstractmethod + def transform_children(self, t: Callable[[Expression], Expression]) -> Expression: + ... 
+ def walk(self) -> Generator[Expression, None, None]: yield self for child in self.children: @@ -311,6 +315,9 @@ def __eq__(self, other): return self.value == other.value and self.dtype == other.dtype + def transform_children(self, t: Callable[[Expression], Expression]) -> Expression: + return self + @dataclasses.dataclass(frozen=True) class UnboundVariableExpression(Expression): @@ -362,6 +369,9 @@ def is_bijective(self) -> bool: def is_identity(self) -> bool: return True + def transform_children(self, t: Callable[[Expression], Expression]) -> Expression: + return self + @dataclasses.dataclass(frozen=True) class DerefOp(Expression): @@ -414,6 +424,9 @@ def is_bijective(self) -> bool: def is_identity(self) -> bool: return True + def transform_children(self, t: Callable[[Expression], Expression]) -> Expression: + return self + @dataclasses.dataclass(frozen=True) class SchemaFieldRefExpression(Expression): @@ -463,12 +476,15 @@ def is_bijective(self) -> bool: def is_identity(self) -> bool: return True + def transform_children(self, t: Callable[[Expression], Expression]) -> Expression: + return self + @dataclasses.dataclass(frozen=True) class OpExpression(Expression): """An expression representing a scalar operation applied to 1 or more argument sub-expressions.""" - op: bigframes.operations.RowOp + op: bigframes.operations.ScalarOp inputs: typing.Tuple[Expression, ...] 
@property @@ -553,6 +569,12 @@ def deterministic(self) -> bool: all(input.deterministic for input in self.inputs) and self.op.deterministic ) + def transform_children(self, t: Callable[[Expression], Expression]) -> Expression: + new_inputs = tuple(t(input) for input in self.inputs) + if new_inputs != self.inputs: + return dataclasses.replace(self, inputs=new_inputs) + return self + def bind_schema_fields( expr: Expression, field_by_id: Mapping[ids.ColumnId, field.Field] diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 38becd29df..205621fee2 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -1008,6 +1008,14 @@ def referenced_ids(self) -> COLUMN_SET: def _node_expressions(self): return (self.predicate,) + def transform_exprs( + self, fn: Callable[[ex.Expression], ex.Expression] + ) -> FilterNode: + return dataclasses.replace( + self, + predicate=fn(self.predicate), + ) + def remap_vars( self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> FilterNode: @@ -1066,6 +1074,20 @@ def referenced_ids(self) -> COLUMN_SET: def _node_expressions(self): return tuple(map(lambda x: x.scalar_expression, self.by)) + def transform_exprs( + self, fn: Callable[[ex.Expression], ex.Expression] + ) -> OrderByNode: + new_by = cast( + tuple[OrderingExpression, ...], + tuple( + dataclasses.replace( + by_expr, scalar_expression=fn(by_expr.scalar_expression) + ) + for by_expr in self.by + ), + ) + return dataclasses.replace(self, by=new_by) + def remap_vars( self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> OrderByNode: @@ -1078,14 +1100,9 @@ def remap_refs( itertools.chain.from_iterable(map(lambda x: x.referenced_columns, self.by)) ) ref_mapping = {id: ex.DerefOp(mappings[id]) for id in all_refs} - new_by = cast( - tuple[OrderingExpression, ...], - tuple( - by_expr.bind_refs(ref_mapping, allow_partial_bindings=True) - for by_expr in self.by - ), + return self.transform_exprs( + lambda ex: 
ex.bind_refs(ref_mapping, allow_partial_bindings=True) ) - return dataclasses.replace(self, by=new_by) @dataclasses.dataclass(frozen=True, eq=False) @@ -1293,6 +1310,12 @@ def _node_expressions(self): def additive_base(self) -> BigFrameNode: return self.child + def transform_exprs( + self, fn: Callable[[ex.Expression], ex.Expression] + ) -> ProjectionNode: + new_fields = tuple((fn(ex), id) for ex, id in self.assignments) + return dataclasses.replace(self, assignments=new_fields) + def replace_additive_base(self, node: BigFrameNode) -> ProjectionNode: return dataclasses.replace(self, child=node) diff --git a/bigframes/core/rewrite/op_lowering.py b/bigframes/core/rewrite/op_lowering.py new file mode 100644 index 0000000000..a64a4cc8c4 --- /dev/null +++ b/bigframes/core/rewrite/op_lowering.py @@ -0,0 +1,57 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import abc +from typing import Sequence + +from bigframes.core import bigframe_node, expression, nodes +import bigframes.operations as ops + + +class OpLoweringRule(abc.ABC): + @property + @abc.abstractmethod + def op(self) -> type[ops.ScalarOp]: + ... + + @abc.abstractmethod + def lower(self, expr: expression.OpExpression) -> expression.Expression: + ... 
+ + +def lower_ops( + root: bigframe_node.BigFrameNode, rules: Sequence[OpLoweringRule] +) -> bigframe_node.BigFrameNode: + rules_by_op = {rule.op: rule for rule in rules} + + def lower_expr(expr: expression.Expression): + def lower_expr_step(expr: expression.Expression) -> expression.Expression: + if isinstance(expr, expression.OpExpression): + maybe_rule = rules_by_op.get(expr.op.__class__) + if maybe_rule: + return maybe_rule.lower(expr) + return expr + + return lower_expr_step(expr.transform_children(lower_expr_step)) + + def lower_node(node: bigframe_node.BigFrameNode) -> bigframe_node.BigFrameNode: + if isinstance( + node, (nodes.ProjectionNode, nodes.FilterNode, nodes.OrderByNode) + ): + return node.transform_exprs(lower_expr) + else: + return node + + return root.bottom_up(lower_node) diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index b434e473e9..f7f0cc80bb 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -2150,7 +2150,7 @@ def test_df_corrwith_series(scalars_dfs): operator.sub, operator.mul, operator.truediv, - # operator.floordiv, + operator.floordiv, operator.eq, operator.ne, operator.gt, @@ -2163,7 +2163,7 @@ def test_df_corrwith_series(scalars_dfs): "subtract", "multiply", "true_divide", - # "floor_divide", + "floor_divide", "eq", "ne", "gt", @@ -2217,8 +2217,8 @@ def test_scalar_binop_str_exception(scalars_dfs): (lambda x, y: x.rmul(y, axis="index")), (lambda x, y: x.truediv(y, axis="index")), (lambda x, y: x.rtruediv(y, axis="index")), - # (lambda x, y: x.floordiv(y, axis="index")), - # (lambda x, y: x.floordiv(y, axis="index")), + (lambda x, y: x.floordiv(y, axis="index")), + (lambda x, y: x.floordiv(y, axis="index")), (lambda x, y: x.gt(y, axis="index")), (lambda x, y: x.ge(y, axis="index")), (lambda x, y: x.lt(y, axis="index")), @@ -2233,8 +2233,8 @@ def test_scalar_binop_str_exception(scalars_dfs): "rmul", "truediv", "rtruediv", - # "floordiv", - # "rfloordiv", 
+ "floordiv", + "rfloordiv", "gt", "ge", "lt", From 9fb3cb444607df6736d383a2807059bca470c453 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 26 Jun 2025 07:26:41 -0700 Subject: [PATCH 13/28] fix: revert dict back to protobuf in the iam binding update (#1838) * fix: update the iam binding update logic This is needed as some recent updates to a dependency (google-cloud-resource-manager or its dependencies) broke the existing logic, and we are seeing the error like this: --> policy.bindings.append(new_binding) TypeError: Expected a message object, but got {'role': 'roles/run.invoker', 'members': [...]} * use right import * add unit test mocking the iam update method * use mock bq connection client * import module rephrasing --- bigframes/clients.py | 6 ++---- tests/unit/test_clients.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/bigframes/clients.py b/bigframes/clients.py index f1f6d686fd..e6ddd5c6cb 100644 --- a/bigframes/clients.py +++ b/bigframes/clients.py @@ -24,6 +24,7 @@ import google.api_core.exceptions import google.api_core.retry from google.cloud import bigquery_connection_v1, resourcemanager_v3 +from google.iam.v1 import policy_pb2 logger = logging.getLogger(__name__) @@ -172,10 +173,7 @@ def _ensure_iam_binding( return # Create a new binding - new_binding = { - "role": role, - "members": [service_account], - } # Use a dictionary to avoid problematic google.iam namespace package. + new_binding = policy_pb2.Binding(role=role, members=[service_account]) policy.bindings.append(new_binding) request = { "resource": project, diff --git a/tests/unit/test_clients.py b/tests/unit/test_clients.py index 032512c26e..9daa759838 100644 --- a/tests/unit/test_clients.py +++ b/tests/unit/test_clients.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from unittest import mock + +from google.cloud import bigquery_connection_v1, resourcemanager_v3 +from google.iam.v1 import policy_pb2 import pytest from bigframes import clients @@ -65,3 +69,27 @@ def test_get_canonical_bq_connection_id_invalid_path(): default_project="default-project", default_location="us", ) + + +def test_ensure_iam_binding(): + bq_connection_client = mock.create_autospec( + bigquery_connection_v1.ConnectionServiceClient, instance=True + ) + resource_manager_client = mock.create_autospec( + resourcemanager_v3.ProjectsClient, instance=True + ) + resource_manager_client.get_iam_policy.return_value = policy_pb2.Policy( + bindings=[ + policy_pb2.Binding( + role="roles/test.role1", members=["serviceAccount:serviceAccount1"] + ) + ] + ) + bq_connection_manager = clients.BqConnectionManager( + bq_connection_client, resource_manager_client + ) + bq_connection_manager._IAM_WAIT_SECONDS = 0 # no need to wait in test + bq_connection_manager._ensure_iam_binding( + "test-project", "serviceAccount2", "roles/test.role2" + ) + resource_manager_client.set_iam_policy.assert_called_once() From daf0c3b349fb1e85e7070c54a2d3f5460f5e40c9 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 26 Jun 2025 10:57:40 -0700 Subject: [PATCH 14/28] feat: Add experimental polars execution (#1747) --- bigframes/_config/bigquery_options.py | 24 +++++++ bigframes/_importing.py | 30 ++++++++ bigframes/core/compile/polars/compiler.py | 4 +- bigframes/session/__init__.py | 1 + bigframes/session/bq_caching_executor.py | 24 +++++-- bigframes/session/polars_executor.py | 2 +- bigframes/testing/polars_session.py | 4 +- noxfile.py | 2 +- testing/constraints-3.10.txt | 1 + tests/system/small/test_polars_execution.py | 76 +++++++++++++++++++++ 10 files changed, 157 insertions(+), 11 deletions(-) create mode 100644 bigframes/_importing.py create mode 100644 tests/system/small/test_polars_execution.py diff --git a/bigframes/_config/bigquery_options.py 
b/bigframes/_config/bigquery_options.py index d591ea85b3..09ffee95d4 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -22,6 +22,7 @@ import google.auth.credentials import requests.adapters +import bigframes._importing import bigframes.enums import bigframes.exceptions as bfe @@ -94,6 +95,7 @@ def __init__( requests_transport_adapters: Sequence[ Tuple[str, requests.adapters.BaseAdapter] ] = (), + enable_polars_execution: bool = False, ): self._credentials = credentials self._project = project @@ -113,6 +115,9 @@ def __init__( client_endpoints_override = {} self._client_endpoints_override = client_endpoints_override + if enable_polars_execution: + bigframes._importing.import_polars() + self._enable_polars_execution = enable_polars_execution @property def application_name(self) -> Optional[str]: @@ -424,3 +429,22 @@ def requests_transport_adapters( SESSION_STARTED_MESSAGE.format(attribute="requests_transport_adapters") ) self._requests_transport_adapters = value + + @property + def enable_polars_execution(self) -> bool: + """If True, will use polars to execute some simple query plans locally.""" + return self._enable_polars_execution + + @enable_polars_execution.setter + def enable_polars_execution(self, value: bool): + if self._session_started and self._enable_polars_execution != value: + raise ValueError( + SESSION_STARTED_MESSAGE.format(attribute="enable_polars_execution") + ) + if value is True: + msg = bfe.format_message( + "Polars execution is an experimental feature, and may not be stable. Must have polars installed." 
+ ) + warnings.warn(msg, category=bfe.PreviewWarning) + bigframes._importing.import_polars() + self._enable_polars_execution = value diff --git a/bigframes/_importing.py b/bigframes/_importing.py new file mode 100644 index 0000000000..095a1d9c51 --- /dev/null +++ b/bigframes/_importing.py @@ -0,0 +1,30 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import importlib +from types import ModuleType + +from packaging import version + +# Keep this in sync with setup.py +POLARS_MIN_VERSION = version.Version("1.7.0") + + +def import_polars() -> ModuleType: + polars_module = importlib.import_module("polars") + imported_version = version.Version(polars_module.build_info()["version"]) + if imported_version < POLARS_MIN_VERSION: + raise ImportError( + f"Imported polars version: {imported_version} is below the minimum version: {POLARS_MIN_VERSION}" + ) + return polars_module diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index cc007623e1..6b76f3f53d 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -393,7 +393,7 @@ class PolarsCompiler: expr_compiler = PolarsExpressionCompiler() agg_compiler = PolarsAggregateCompiler() - def compile(self, array_value: bigframes.core.ArrayValue) -> pl.LazyFrame: + def compile(self, plan: nodes.BigFrameNode) -> pl.LazyFrame: if not polars_installed: raise ValueError( "Polars is not installed, cannot compile to 
polars engine." @@ -401,7 +401,7 @@ def compile(self, array_value: bigframes.core.ArrayValue) -> pl.LazyFrame: # TODO: Create standard way to configure BFET -> BFET rewrites # Polars has incomplete slice support in lazy mode - node = array_value.node + node = plan node = bigframes.core.rewrite.column_pruning(node) node = nodes.bottom_up(node, bigframes.core.rewrite.rewrite_slice) node = bigframes.core.rewrite.pull_out_window_order(node) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 13db6823c1..8cbcf8612e 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -255,6 +255,7 @@ def __init__( storage_manager=self._temp_storage_manager, strictly_ordered=self._strictly_ordered, metrics=self._metrics, + enable_polars_execution=context.enable_polars_execution, ) def __del__(self): diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 9ad8da33a8..6750652bc2 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -41,7 +41,13 @@ import bigframes.core.tree_properties as tree_properties import bigframes.dtypes import bigframes.features -from bigframes.session import executor, loader, local_scan_executor, read_api_execution +from bigframes.session import ( + executor, + loader, + local_scan_executor, + read_api_execution, + semi_executor, +) import bigframes.session._io.bigquery as bq_io import bigframes.session.metrics import bigframes.session.planner @@ -147,6 +153,7 @@ def __init__( *, strictly_ordered: bool = True, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, + enable_polars_execution: bool = False, ): self.bqclient = bqclient self.storage_manager = storage_manager @@ -155,14 +162,21 @@ def __init__( self.metrics = metrics self.loader = loader self.bqstoragereadclient = bqstoragereadclient - # Simple left-to-right precedence for now - self._semi_executors = ( + self._enable_polars_execution = 
enable_polars_execution + self._semi_executors: Sequence[semi_executor.SemiExecutor] = ( read_api_execution.ReadApiSemiExecutor( bqstoragereadclient=bqstoragereadclient, project=self.bqclient.project, ), local_scan_executor.LocalScanExecutor(), ) + if enable_polars_execution: + from bigframes.session import polars_executor + + self._semi_executors = ( + *self._semi_executors, + polars_executor.PolarsExecutor(), + ) self._upload_lock = threading.Lock() def to_sql( @@ -637,8 +651,8 @@ def _execute_plan( """Just execute whatever plan as is, without further caching or decomposition.""" # First try to execute fast-paths if not output_spec.require_bq_table: - for semi_executor in self._semi_executors: - maybe_result = semi_executor.execute(plan, ordered=ordered, peek=peek) + for exec in self._semi_executors: + maybe_result = exec.execute(plan, ordered=ordered, peek=peek) if maybe_result: return maybe_result diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py index b2f7f5ccd6..8d415032fb 100644 --- a/bigframes/session/polars_executor.py +++ b/bigframes/session/polars_executor.py @@ -73,7 +73,7 @@ def execute( # Note: Ignoring ordered flag, as just executing totally ordered is fine. try: lazy_frame: pl.LazyFrame = self._compiler.compile( - array_value.ArrayValue(plan) + array_value.ArrayValue(plan).node ) except Exception: return None diff --git a/bigframes/testing/polars_session.py b/bigframes/testing/polars_session.py index 7b898a9f00..3710c40eae 100644 --- a/bigframes/testing/polars_session.py +++ b/bigframes/testing/polars_session.py @@ -41,7 +41,7 @@ def peek( """ A 'peek' efficiently accesses a small number of rows in the dataframe. """ - lazy_frame: polars.LazyFrame = self.compiler.compile(array_value) + lazy_frame: polars.LazyFrame = self.compiler.compile(array_value.node) pa_table = lazy_frame.collect().limit(n_rows).to_arrow() # Currently, pyarrow types might not quite be exactly the ones in the bigframes schema. 
# Nullability may be different, and might use large versions of list, string datatypes. @@ -64,7 +64,7 @@ def execute( """ Execute the ArrayValue, storing the result to a temporary session-owned table. """ - lazy_frame: polars.LazyFrame = self.compiler.compile(array_value) + lazy_frame: polars.LazyFrame = self.compiler.compile(array_value.node) pa_table = lazy_frame.collect().to_arrow() # Currently, pyarrow types might not quite be exactly the ones in the bigframes schema. # Nullability may be different, and might use large versions of list, string datatypes. diff --git a/noxfile.py b/noxfile.py index 9346f26cba..96b59d6776 100644 --- a/noxfile.py +++ b/noxfile.py @@ -108,7 +108,7 @@ SYSTEM_TEST_EXTRAS: List[str] = [] SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { "3.9": ["tests", "anywidget"], - "3.10": ["tests"], + "3.10": ["tests", "polars"], "3.12": ["tests", "scikit-learn", "polars", "anywidget"], "3.13": ["tests", "polars"], } diff --git a/testing/constraints-3.10.txt b/testing/constraints-3.10.txt index b11ab5a88d..12ad443aab 100644 --- a/testing/constraints-3.10.txt +++ b/testing/constraints-3.10.txt @@ -15,3 +15,4 @@ matplotlib==3.7.1 psutil==5.9.5 seaborn==0.13.1 traitlets==5.7.1 +polars==1.7.0 diff --git a/tests/system/small/test_polars_execution.py b/tests/system/small/test_polars_execution.py new file mode 100644 index 0000000000..0aed693b80 --- /dev/null +++ b/tests/system/small/test_polars_execution.py @@ -0,0 +1,76 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import pytest + +import bigframes +from bigframes.testing.utils import assert_pandas_df_equal + +polars = pytest.importorskip("polars", reason="polars is required for this test") + + +@pytest.fixture(scope="module") +def session_w_polars(): + context = bigframes.BigQueryOptions(location="US", enable_polars_execution=True) + session = bigframes.Session(context=context) + yield session + session.close() # close generated session at cleanup time + + +def test_polar_execution_sorted(session_w_polars, scalars_pandas_df_index): + execution_count_before = session_w_polars._metrics.execution_count + bf_df = session_w_polars.read_pandas(scalars_pandas_df_index) + + pd_result = scalars_pandas_df_index.sort_index(ascending=False)[ + ["int64_too", "bool_col"] + ] + bf_result = bf_df.sort_index(ascending=False)[["int64_too", "bool_col"]].to_pandas() + + assert session_w_polars._metrics.execution_count == execution_count_before + assert_pandas_df_equal(bf_result, pd_result) + + +def test_polar_execution_sorted_filtered(session_w_polars, scalars_pandas_df_index): + execution_count_before = session_w_polars._metrics.execution_count + bf_df = session_w_polars.read_pandas(scalars_pandas_df_index) + + pd_result = scalars_pandas_df_index.sort_index(ascending=False).dropna( + subset=["int64_col", "string_col"] + ) + bf_result = ( + bf_df.sort_index(ascending=False) + .dropna(subset=["int64_col", "string_col"]) + .to_pandas() + ) + + # Filter and isnull not supported by polar engine yet, so falls back to bq execution + assert session_w_polars._metrics.execution_count == (execution_count_before + 1) + assert_pandas_df_equal(bf_result, pd_result) + + +def test_polar_execution_unsupported_sql_fallback( + session_w_polars, scalars_pandas_df_index +): + execution_count_before = session_w_polars._metrics.execution_count + bf_df = session_w_polars.read_pandas(scalars_pandas_df_index) + + 
pd_df = scalars_pandas_df_index.copy() + pd_df["str_len_col"] = pd_df.string_col.str.len() + pd_result = pd_df + + bf_df["str_len_col"] = bf_df.string_col.str.len() + bf_result = bf_df.to_pandas() + + # str len not supported by polar engine yet, so falls back to bq execution + assert session_w_polars._metrics.execution_count == (execution_count_before + 1) + assert_pandas_df_equal(bf_result, pd_result) From 25684ff60367f49dd318d4677a7438abdc98bff9 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 26 Jun 2025 12:25:22 -0700 Subject: [PATCH 15/28] feat: support item assignment in series (#1859) * Add item assignment for bigframes dataframe * testcase update --- bigframes/series.py | 4 ++ tests/system/small/test_series.py | 63 +++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/bigframes/series.py b/bigframes/series.py index ae6cd7b2ad..ebc2913f78 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1598,6 +1598,10 @@ def __getattr__(self, key: str): else: raise AttributeError(key) + def __setitem__(self, key, value) -> None: + """Set item using direct assignment, delegating to .loc indexer.""" + self.loc[key] = value + def _apply_aggregation( self, op: agg_ops.UnaryAggregateOp | agg_ops.NullaryAggregateOp ) -> Any: diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 6760d63a20..d513b0e780 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -510,6 +510,69 @@ def test_series___getitem___with_default_index(scalars_dfs): assert bf_result == pd_result +@pytest.mark.parametrize( + ("index_col", "key", "value"), + ( + ("int64_too", 2, "new_string_value"), + ("string_col", "Hello, World!", "updated_value"), + ("int64_too", 0, None), + ), +) +def test_series___setitem__(scalars_dfs, index_col, key, value): + col_name = "string_col" + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.set_index(index_col, drop=False) + scalars_pandas_df = 
scalars_pandas_df.set_index(index_col, drop=False) + + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name].copy() + + bf_series[key] = value + pd_series[key] = value + + pd.testing.assert_series_equal(bf_series.to_pandas(), pd_series) + + +@pytest.mark.parametrize( + ("key", "value"), + ( + (0, 999), + (1, 888), + (0, None), + (-2345, 777), + ), +) +def test_series___setitem___with_int_key_numeric(scalars_dfs, key, value): + col_name = "int64_col" + index_col = "int64_too" + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.set_index(index_col, drop=False) + scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) + + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name].copy() + + bf_series[key] = value + pd_series[key] = value + + pd.testing.assert_series_equal(bf_series.to_pandas(), pd_series) + + +def test_series___setitem___with_default_index(scalars_dfs): + col_name = "float64_col" + key = 2 + value = 123.456 + scalars_df, scalars_pandas_df = scalars_dfs + + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name].copy() + + bf_series[key] = value + pd_series[key] = value + + assert bf_series.to_pandas().iloc[key] == pd_series.iloc[key] + + @pytest.mark.parametrize( ("col_name",), ( From 942e66c483c9afbb680a7af56c9e9a76172a33e1 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 27 Jun 2025 10:42:36 -0700 Subject: [PATCH 16/28] feat: Add size op support in local engine (#1865) --- bigframes/session/polars_executor.py | 6 +- .../system/small/engines/test_aggregation.py | 82 +++++++++++++++++++ 2 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 tests/system/small/engines/test_aggregation.py diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py index 8d415032fb..24acda35dc 100644 --- a/bigframes/session/polars_executor.py +++ b/bigframes/session/polars_executor.py @@ -20,6 +20,7 @@ from bigframes.core import 
array_value, bigframe_node, expression, local_data, nodes import bigframes.operations +from bigframes.operations import aggregations as agg_ops from bigframes.session import executor, semi_executor if TYPE_CHECKING: @@ -32,9 +33,11 @@ nodes.ReversedNode, nodes.SelectionNode, nodes.SliceNode, + nodes.AggregateNode, ) _COMPATIBLE_SCALAR_OPS = () +_COMPATIBLE_AGG_OPS = (agg_ops.SizeOp, agg_ops.SizeUnaryOp) def _get_expr_ops(expr: expression.Expression) -> set[bigframes.operations.ScalarOp]: @@ -48,7 +51,8 @@ def _is_node_polars_executable(node: nodes.BigFrameNode): return False for expr in node._node_expressions: if isinstance(expr, expression.Aggregation): - return False + if not type(expr.op) in _COMPATIBLE_AGG_OPS: + return False if isinstance(expr, expression.Expression): if not _get_expr_ops(expr).issubset(_COMPATIBLE_SCALAR_OPS): return False diff --git a/tests/system/small/engines/test_aggregation.py b/tests/system/small/engines/test_aggregation.py new file mode 100644 index 0000000000..2c323a5f28 --- /dev/null +++ b/tests/system/small/engines/test_aggregation.py @@ -0,0 +1,82 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from bigframes.core import array_value, expression, identifiers, nodes +import bigframes.operations.aggregations as agg_ops +from bigframes.session import polars_executor +from bigframes.testing.engine_utils import assert_equivalence_execution + +pytest.importorskip("polars") + +# Polars used as reference as its fast and local. Generally though, prefer gbq engine where they disagree. +REFERENCE_ENGINE = polars_executor.PolarsExecutor() + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_aggregate_size( + scalars_array_value: array_value.ArrayValue, + engine, +): + node = nodes.AggregateNode( + scalars_array_value.node, + aggregations=( + ( + expression.NullaryAggregation(agg_ops.SizeOp()), + identifiers.ColumnId("size_op"), + ), + ( + expression.UnaryAggregation( + agg_ops.SizeUnaryOp(), expression.deref("string_col") + ), + identifiers.ColumnId("unary_size_op"), + ), + ), + ) + assert_equivalence_execution(node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize( + "grouping_cols", + [ + ["bool_col"], + ["string_col", "int64_col"], + ["date_col"], + ["datetime_col"], + ["timestamp_col"], + ["bytes_col"], + ], +) +def test_engines_grouped_aggregate( + scalars_array_value: array_value.ArrayValue, engine, grouping_cols +): + node = nodes.AggregateNode( + scalars_array_value.node, + aggregations=( + ( + expression.NullaryAggregation(agg_ops.SizeOp()), + identifiers.ColumnId("size_op"), + ), + ( + expression.UnaryAggregation( + agg_ops.SizeUnaryOp(), expression.deref("string_col") + ), + identifiers.ColumnId("unary_size_op"), + ), + ), + by_column_ids=tuple(expression.deref(id) for id in grouping_cols), + ) + assert_equivalence_execution(node, REFERENCE_ENGINE, engine) From c5d251a1d454bb4ef55ea9905faeadd646a23b14 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 27 Jun 2025 13:03:21 -0700 Subject: [PATCH 17/28] feat: support index 
item assign in Series (#1868) * implement index item assignment * add testcase * final touch up --- bigframes/core/indexes/base.py | 5 +++++ tests/system/small/test_index.py | 26 ++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index bc8b47d216..f653b8700b 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -174,6 +174,11 @@ def dtypes(self) -> pandas.Series: index=typing.cast(typing.Tuple, self._block.index.names), ) + def __setitem__(self, key, value) -> None: + """Index objects are immutable. Use Index constructor to create + modified Index.""" + raise TypeError("Index does not support mutable operations") + @property def size(self) -> int: return self.shape[0] diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index 3b9854be26..c7e316a9d2 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -499,3 +499,29 @@ def test_index_item_with_empty(session): with pytest.raises(ValueError, match=re.escape(expected_message)): bf_idx_empty.item() + + +@pytest.mark.parametrize( + ("key", "value"), + [ + (0, "string_value"), + (1, 42), + ("label", None), + (-1, 3.14), + ], +) +def test_index_setitem_different_types(scalars_dfs, key, value): + """Tests that custom Index setitem raises TypeError.""" + scalars_df, _ = scalars_dfs + index = scalars_df.index + + with pytest.raises(TypeError, match="Index does not support mutable operations"): + index[key] = value + + +def test_custom_index_setitem_error(): + """Tests that custom Index setitem raises TypeError.""" + custom_index = bpd.Index([1, 2, 3, 4, 5], name="custom") + + with pytest.raises(TypeError, match="Index does not support mutable operations"): + custom_index[2] = 999 From ed75cd99ff373181a6d9576f5cf8efa83bb394db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 27 Jun 2025 16:24:46 -0500 Subject: 
[PATCH 18/28] chore: add benchmarks for read_gbq_colab (#1860) * chore: add benchmarks for read_gbq_colab * correct project id * exclude error too * Delete tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error * explain column selection for groupby --- tests/benchmark/.gitignore | 6 ++ .../read_gbq_colab/aggregate_output.py | 72 +++++++++++++++++++ tests/benchmark/read_gbq_colab/config.jsonl | 10 +++ tests/benchmark/read_gbq_colab/dry_run.py | 48 +++++++++++++ .../benchmark/read_gbq_colab/filter_output.py | 60 ++++++++++++++++ tests/benchmark/read_gbq_colab/first_page.py | 53 ++++++++++++++ tests/benchmark/read_gbq_colab/last_page.py | 54 ++++++++++++++ tests/benchmark/read_gbq_colab/sort_output.py | 64 +++++++++++++++++ tests/benchmark/utils.py | 3 + 9 files changed, 370 insertions(+) create mode 100644 tests/benchmark/.gitignore create mode 100644 tests/benchmark/read_gbq_colab/aggregate_output.py create mode 100644 tests/benchmark/read_gbq_colab/config.jsonl create mode 100644 tests/benchmark/read_gbq_colab/dry_run.py create mode 100644 tests/benchmark/read_gbq_colab/filter_output.py create mode 100644 tests/benchmark/read_gbq_colab/first_page.py create mode 100644 tests/benchmark/read_gbq_colab/last_page.py create mode 100644 tests/benchmark/read_gbq_colab/sort_output.py diff --git a/tests/benchmark/.gitignore b/tests/benchmark/.gitignore new file mode 100644 index 0000000000..f1bf042bf7 --- /dev/null +++ b/tests/benchmark/.gitignore @@ -0,0 +1,6 @@ +*.bytesprocessed +*.bq_exec_time_seconds +*.error +*.local_exec_time_seconds +*.query_char_count +*.slotmillis diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py new file mode 100644 index 0000000000..b612e2998c --- /dev/null +++ b/tests/benchmark/read_gbq_colab/aggregate_output.py @@ -0,0 +1,72 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + +PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE + + +def aggregate_output( + *, project_id, dataset_id, table_id, session: bigframes.session.Session +): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. "{local_inline}" or "{local_large}" + df = session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" + ) + + # Simulate getting the first page, since we'll always do that first in the UI. + df.shape + next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + + # To simulate very small rows that can only fit a boolean, + # some tables don't have an integer column. If an integer column is available, + # we prefer to group by that to get a more realistic number of groups. 
+ group_column = "col_int64_1" + if group_column not in df.columns: + group_column = "col_bool_0" + + # Simulate the user aggregating by a column and visualizing those results + df_aggregated = ( + df.assign(rounded=df[group_column].astype("Int64").round(-9)) + .groupby("rounded") + .sum() + ) + + df_aggregated.shape + next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE))) + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + aggregate_output, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/read_gbq_colab/config.jsonl b/tests/benchmark/read_gbq_colab/config.jsonl new file mode 100644 index 0000000000..6f1ddf4a5f --- /dev/null +++ b/tests/benchmark/read_gbq_colab/config.jsonl @@ -0,0 +1,10 @@ +{"benchmark_suffix": "percentile_09", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_09", "ordered": false} +{"benchmark_suffix": "percentile_19", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_19", "ordered": false} +{"benchmark_suffix": "percentile_29", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_29", "ordered": false} +{"benchmark_suffix": "percentile_39", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_39", "ordered": false} +{"benchmark_suffix": "percentile_49", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_49", "ordered": false} +{"benchmark_suffix": "percentile_59", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_59", "ordered": false} +{"benchmark_suffix": 
"percentile_69", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_69", "ordered": false} +{"benchmark_suffix": "percentile_79", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_79", "ordered": false} +{"benchmark_suffix": "percentile_89", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_89", "ordered": false} +{"benchmark_suffix": "percentile_99", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_99", "ordered": false} diff --git a/tests/benchmark/read_gbq_colab/dry_run.py b/tests/benchmark/read_gbq_colab/dry_run.py new file mode 100644 index 0000000000..c2de1b7cc4 --- /dev/null +++ b/tests/benchmark/read_gbq_colab/dry_run.py @@ -0,0 +1,48 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + + +def dry_run(*, project_id, dataset_id, table_id, session: bigframes.session.Session): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. 
"{local_inline}" or "{local_large}" + session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}", + dry_run=True, + ) + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + dry_run, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py new file mode 100644 index 0000000000..7945d9f0c6 --- /dev/null +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -0,0 +1,60 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + +PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE + + +def filter_output( + *, project_id, dataset_id, table_id, session: bigframes.session.Session +): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. "{local_inline}" or "{local_large}" + df = session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" + ) + + # Simulate getting the first page, since we'll always do that first in the UI. 
+ df.shape + next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + + # Simulate the user filtering by a column and visualizing those results + df_filtered = df[df["col_bool_0"]] + df_filtered.shape + next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE))) + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + filter_output, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py new file mode 100644 index 0000000000..2df9990d22 --- /dev/null +++ b/tests/benchmark/read_gbq_colab/first_page.py @@ -0,0 +1,53 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + +PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE + + +def first_page(*, project_id, dataset_id, table_id, session: bigframes.session.Session): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. "{local_inline}" or "{local_large}" + df = session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" + ) + + # Get number of rows (to calculate number of pages) and the first page. 
+ df.shape + next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + first_page, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py new file mode 100644 index 0000000000..ad785a29e8 --- /dev/null +++ b/tests/benchmark/read_gbq_colab/last_page.py @@ -0,0 +1,54 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + +PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE + + +def last_page(*, project_id, dataset_id, table_id, session: bigframes.session.Session): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. "{local_inline}" or "{local_large}" + df = session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" + ) + + # Get number of rows (to calculate number of pages) and then all pages. 
+ df.shape + for _ in df.to_pandas_batches(page_size=PAGE_SIZE): + pass + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + last_page, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py new file mode 100644 index 0000000000..997de5683d --- /dev/null +++ b/tests/benchmark/read_gbq_colab/sort_output.py @@ -0,0 +1,64 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + +PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE + + +def sort_output( + *, project_id, dataset_id, table_id, session: bigframes.session.Session +): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. "{local_inline}" or "{local_large}" + df = session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" + ) + + # Simulate getting the first page, since we'll always do that first in the UI. 
+ df.shape + next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + + # Simulate the user sorting by a column and visualizing those results + sort_column = "col_int64_1" + if sort_column not in df.columns: + sort_column = "col_bool_0" + + df_sorted = df.sort_values(sort_column) + df_sorted.shape + next(iter(df_sorted.to_pandas_batches(page_size=PAGE_SIZE))) + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + sort_output, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/utils.py b/tests/benchmark/utils.py index 887d54dba2..48357ddde7 100644 --- a/tests/benchmark/utils.py +++ b/tests/benchmark/utils.py @@ -17,6 +17,8 @@ import bigframes +READ_GBQ_COLAB_PAGE_SIZE = 100 + def get_configuration(include_table_id=False): parser = argparse.ArgumentParser() @@ -94,6 +96,7 @@ def _str_to_bool(value): def _initialize_session(ordered: bool): + # TODO(tswast): add a flag to enable the polars semi-executor. 
context = bigframes.BigQueryOptions( location="US", ordering_mode="strict" if ordered else "partial" ) From cc339e9938129cac896460e3a794b3ec8479fa4a Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 27 Jun 2025 15:01:34 -0700 Subject: [PATCH 19/28] fix: Fix bug selecting column repeatedly (#1858) --- bigframes/core/array_value.py | 25 ++++++++++++++++++++----- bigframes/core/blocks.py | 5 ++++- tests/system/small/test_dataframe.py | 9 +++++++++ 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index 4b05781cb7..b47637cb59 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -330,12 +330,27 @@ def create_constant( return self.project_to_id(ex.const(value, dtype)) - def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: + def select_columns( + self, column_ids: typing.Sequence[str], allow_renames: bool = False + ) -> ArrayValue: # This basically just drops and reorders columns - logically a no-op except as a final step - selections = ( - bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id)) - for col_id in column_ids - ) + selections = [] + seen = set() + + for id in column_ids: + if id not in seen: + ref = nodes.AliasedRef.identity(ids.ColumnId(id)) + elif allow_renames: + ref = nodes.AliasedRef( + ex.deref(id), ids.ColumnId(bigframes.core.guid.generate_guid()) + ) + else: + raise ValueError( + "Must set allow_renames=True to select columns repeatedly" + ) + selections.append(ref) + seen.add(id) + return ArrayValue( nodes.SelectionNode( child=self.node, diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 675e8c8b7a..1426459912 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1210,7 +1210,10 @@ def select_column(self, id: str) -> Block: return self.select_columns([id]) def select_columns(self, ids: typing.Sequence[str]) -> Block: - expr = self._expr.select_columns([*self.index_columns, 
*ids]) + # Allow renames as may end up selecting same columns multiple times + expr = self._expr.select_columns( + [*self.index_columns, *ids], allow_renames=True + ) col_labels = self._get_labels_for_columns(ids) return Block(expr, self.index_columns, col_labels, self.index.names) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index b037c6f371..d5446efcd0 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3408,6 +3408,15 @@ def test__dir__with_rename(scalars_dfs): assert "drop" in results +def test_loc_select_columns_w_repeats(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index[["int64_col", "int64_col", "int64_too"]].to_pandas() + pd_result = scalars_pandas_df_index[["int64_col", "int64_col", "int64_too"]] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + @pytest.mark.parametrize( ("start", "stop", "step"), [ From 6c3f68a757ff12382f5da6c67da5667f9bcb1f8b Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 27 Jun 2025 15:35:35 -0700 Subject: [PATCH 20/28] upload logo to repo (#1869) --- third_party/logo/colab-logo.png | Bin 0 -> 1539 bytes third_party/logo/github-logo.png | Bin 0 -> 1714 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 third_party/logo/colab-logo.png create mode 100644 third_party/logo/github-logo.png diff --git a/third_party/logo/colab-logo.png b/third_party/logo/colab-logo.png new file mode 100644 index 0000000000000000000000000000000000000000..75740a2b6ae87543bd97bb5a50bb8ac05b5c0d74 GIT binary patch literal 1539 zcmZ`%c~DbF7~hA4m%K+%F?fKqQ8~h>B}7ETi$H)d0s>MMTP5KvR7qY8wKyIr&?@6- zQ%qYgs;DEF5Cp6zoo-wMTzK`gBZc8kPz{8U()G}e{^Qw?)Sd^_IG@{<;h8L zOw1bt05Ic)f>gLh!Y`0M4$iw`TRh<6n#WJ%1JLk2{!UDT|LiPbY9atz0sv5y0Wbq{YC3Q7y1(s4=RhW@|05)dH)U8ZfZI*^!64 zB2Z3jh;pVd$9v`(!pYD4Pq+7 zOax<4I>A6-76nRiIZ2(8YAIM}y{#HC61ah2P$1h;N~BY&28>iYYSh54rCdM(kkKT^ zj4}p9Io{Q9qyV4x?K%M 
zw7@8*LjeN?4)1k9p9_pqhN%dJydxR_xgd^7hV^cwpeQ36SO5)T=4~jXww^#p$SA}0 zn=s1`l+vjI!w#b1;|M%HK;R?*1_@@6QmUwi{Ab^BLBq$WG6?RRDe+R={1poZ7T6Q$ zqJlCA(O|~ef8!>qV*`cg>$-ohm{KJNCIUAS4E-ji^M0X0M+QEJ>1Wt7+}<8MVk$lU4#=aaB`gAtz9F%#b7#1A zpR0$bvKgaQ(BC|?OzM5TEO*AYOV-X@%%8ErJ!uY>K5J2QL3tk9_h{&m=I1?it|#Z+ zk3Zxy{H$My%x|ym>OFJL5T<)(IRES88fMqMn`moae^*ZP6aUFA?|0+sW>(t8g|UC6 z5o%SIZ-BJ&+|kX!Ya6}{fliRj<#B?xNn)K9{-E;s#28eM(=q?)E3QYi!tK}b0)CqC zT9#-k?KnV1*_pXwaZc9GR|~^0UMSvQY}?g&E0}#@JXTi5cbR#N+j4a3GV$0`PrFj# zSE>r-WyJoZ+`h0l`>mbT;gw>wt5mPdt5ZH*8JAjD6;!IuSpHeul+(x-`-s^GczJsh zK7+k^nFVg$Ule8|*tF%AF}rribpG6f)*Y?dO7~eDuElqH<>AW@*L}{fD17mwv&k*HH9*st~M`Bl^`(5Xxd(Bec+joGEvOHNyTc!N2Q?a3b ztH;leSFiQMi1H{K7ARAJnkEzN&#y(|R$c`Ye-kw-31|>D(JUck@e~ zj&=ioyrg+s+i!O>>Lt3e^2l}j0#+Qj^s&py{#yy1c{c^O_ngVSb6}BV8OB!nOq1!4 zXX|K&@3qCloQIegXTm1hAXz%p9;QxHmKxL%0V&0TRzzznhgyqrIC$F)0{WwLXLrBvd*^wc_uSc%h%m9E z{W5z3f#4_!7RvAyFh6!S_*<8qJ%KOIm?#E|L=rJQq=gB5C6WLG5;c?r%V0>EmEH#X z5eSwPRa6WXBMs#$5H%GtW2go-in9p>zW@UYDNNWc^XOXZQ? z1QjEV00I#$3^1wQUJ8&-2UsjB-G|9y(LDhMNN3PM{APL4eYi{(m*ERcUnJa{R+-3^ z34^A6;U^v`8N*O6ji%S@sd{fJqD`XFIUJ5zgTe5^5nj414F(y!G&=H(f)Lgzv?>%+ zAsWD}2qhpH7>|TU`X&W6IxDNuO_vET7|j5oG&&VDr!)hUO8+0KR?nh!m<)a!?|%yG zqOwq!CWCcIhE{<$E|F|@g>nP6FoYr6C<8>D?ID9%&5J(4oSbR1I^byW*g@__U z4QsF&uJSEcFeleM3~ChjEQGbHOjsGDMbyAl(p=Ttv9RaVo8~I#js@@Y9C^_2U})yn zzSHU%6FxuY?d;&65MyR({^lU*3$z$ZllDb(o&<7d;A_`h2U+3~BJ2Hv`{W}KEU801#cv_B|9Cm!ynR{S`AMsSn z;7E=B;mb!wx$L;S>yGXG^6=&WlQn9$s?&L%Y1D8TI^MlKB1DqsEng$>f4=xYWBoPI z_S1p!sJ#d2?YI4kPA{k}Eby?F=f-J9zIc`YDl^pzjVm~9ebE?Hn?t0Nx+la|D0MB; z9)2xv1G>a1|A9kQ>~DV<=X3-4yC&n!m8-3K#P z{X@0zRuQsy$+N ziSCoLJU{Z$nQy4A4Y5UJ07$5FA~qL2%Q+cLaqDU?Lz3?=BC5;Nk6BbTmmceEaM>-Z zi>O&-dSE=%ex;vcvCOk{*JQ5^_4M z4lW7%l9IqY(z7pV(?I@@8=KPFO82)O{VDI18-*d-k$YmI^XiuPs_LuFw<^ZcD}yP5 c*NrbeloN*74g`U%%F6r~k%+>C^#XapzmV0H-2eap literal 0 HcmV?d00001 From 024be5989044f02f3ea7a50e47b5619893a5182f Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 27 Jun 2025 15:47:24 -0700 Subject: [PATCH 21/28] refactor: rename 
scalars_types_df to scalar_types_df (#1862) --- tests/unit/core/compile/sqlglot/conftest.py | 10 +++++----- .../sqlglot/expressions/test_binary_compiler.py | 12 ++++++------ .../unit/core/compile/sqlglot/test_compile_concat.py | 4 ++-- .../unit/core/compile/sqlglot/test_compile_filter.py | 4 ++-- .../core/compile/sqlglot/test_compile_readlocal.py | 4 ++-- .../core/compile/sqlglot/test_compile_readtable.py | 12 ++++++------ 6 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tests/unit/core/compile/sqlglot/conftest.py b/tests/unit/core/compile/sqlglot/conftest.py index 6d5fac8184..645daddd46 100644 --- a/tests/unit/core/compile/sqlglot/conftest.py +++ b/tests/unit/core/compile/sqlglot/conftest.py @@ -46,9 +46,9 @@ def _create_compiler_session(table_name, table_schema): @pytest.fixture(scope="session") -def compiler_session(scalars_types_table_schema): +def compiler_session(scalar_types_table_schema): """Compiler session for scalar types.""" - return _create_compiler_session("scalar_types", scalars_types_table_schema) + return _create_compiler_session("scalar_types", scalar_types_table_schema) @pytest.fixture(scope="session") @@ -72,7 +72,7 @@ def compiler_session_w_json_types(json_types_table_schema): @pytest.fixture(scope="session") -def scalars_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: +def scalar_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: return [ bigquery.SchemaField("bool_col", "BOOLEAN"), bigquery.SchemaField("bytes_col", "BYTES"), @@ -92,7 +92,7 @@ def scalars_types_table_schema() -> typing.Sequence[bigquery.SchemaField]: @pytest.fixture(scope="session") -def scalars_types_df(compiler_session) -> bpd.DataFrame: +def scalar_types_df(compiler_session) -> bpd.DataFrame: """Returns a BigFrames DataFrame containing all scalar types and using the `rowindex` column as the index.""" bf_df = compiler_session.read_gbq_table("bigframes-dev.sqlglot_test.scalar_types") @@ -101,7 +101,7 @@ def 
scalars_types_df(compiler_session) -> bpd.DataFrame: @pytest.fixture(scope="session") -def scalars_types_pandas_df() -> pd.DataFrame: +def scalar_types_pandas_df() -> pd.DataFrame: """Returns a pandas DataFrame containing all scalar types and using the `rowindex` column as the index.""" # TODO: add tests for empty dataframes diff --git a/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py b/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py index 180d43d771..f3c96e9253 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py @@ -19,24 +19,24 @@ pytest.importorskip("pytest_snapshot") -def test_add_numeric(scalars_types_df: bpd.DataFrame, snapshot): - bf_df = scalars_types_df[["int64_col"]] +def test_add_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col"]] bf_df["int64_col"] = bf_df["int64_col"] + bf_df["int64_col"] snapshot.assert_match(bf_df.sql, "out.sql") -def test_add_numeric_w_scalar(scalars_types_df: bpd.DataFrame, snapshot): - bf_df = scalars_types_df[["int64_col"]] +def test_add_numeric_w_scalar(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col"]] bf_df["int64_col"] = bf_df["int64_col"] + 1 snapshot.assert_match(bf_df.sql, "out.sql") -def test_add_string(scalars_types_df: bpd.DataFrame, snapshot): - bf_df = scalars_types_df[["string_col"]] +def test_add_string(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["string_col"]] bf_df["string_col"] = bf_df["string_col"] + "a" diff --git a/tests/unit/core/compile/sqlglot/test_compile_concat.py b/tests/unit/core/compile/sqlglot/test_compile_concat.py index ec7e83a4b0..79f73d3113 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_concat.py +++ b/tests/unit/core/compile/sqlglot/test_compile_concat.py @@ -22,11 +22,11 @@ def test_compile_concat( - scalars_types_pandas_df: pd.DataFrame, 
compiler_session: bigframes.Session, snapshot + scalar_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot ): # TODO: concat two same dataframes, which SQL does not get reused. # TODO: concat dataframes from a gbq table but trigger a windows compiler. - df1 = bpd.DataFrame(scalars_types_pandas_df, session=compiler_session) + df1 = bpd.DataFrame(scalar_types_pandas_df, session=compiler_session) df1 = df1[["rowindex", "int64_col", "string_col"]] concat_df = bpd.concat([df1, df1]) snapshot.assert_match(concat_df.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/test_compile_filter.py b/tests/unit/core/compile/sqlglot/test_compile_filter.py index 03b54f289a..0afb5eb45b 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_filter.py +++ b/tests/unit/core/compile/sqlglot/test_compile_filter.py @@ -19,7 +19,7 @@ pytest.importorskip("pytest_snapshot") -def test_compile_filter(scalars_types_df: bpd.DataFrame, snapshot): - bf_df = scalars_types_df[["rowindex", "int64_col"]] +def test_compile_filter(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["rowindex", "int64_col"]] bf_filter = bf_df[bf_df["rowindex"] >= 1] snapshot.assert_match(bf_filter.sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py index bd27ad450e..7307fd9b4e 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py +++ b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py @@ -22,9 +22,9 @@ def test_compile_readlocal( - scalars_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot + scalar_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot ): - bf_df = bpd.DataFrame(scalars_types_pandas_df, session=compiler_session) + bf_df = bpd.DataFrame(scalar_types_pandas_df, session=compiler_session) snapshot.assert_match(bf_df.sql, "out.sql") diff --git 
a/tests/unit/core/compile/sqlglot/test_compile_readtable.py b/tests/unit/core/compile/sqlglot/test_compile_readtable.py index d3b5140471..a5692e5fbf 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_readtable.py +++ b/tests/unit/core/compile/sqlglot/test_compile_readtable.py @@ -19,8 +19,8 @@ pytest.importorskip("pytest_snapshot") -def test_compile_readtable(scalars_types_df: bpd.DataFrame, snapshot): - snapshot.assert_match(scalars_types_df.sql, "out.sql") +def test_compile_readtable(scalar_types_df: bpd.DataFrame, snapshot): + snapshot.assert_match(scalar_types_df.sql, "out.sql") def test_compile_readtable_w_repeated_types(repeated_types_df: bpd.DataFrame, snapshot): @@ -37,13 +37,13 @@ def test_compile_readtable_w_json_types(json_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(json_types_df.sql, "out.sql") -def test_compile_readtable_w_ordering(scalars_types_df: bpd.DataFrame, snapshot): - bf_df = scalars_types_df[["int64_col"]] +def test_compile_readtable_w_ordering(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col"]] bf_df = bf_df.sort_values("int64_col") snapshot.assert_match(bf_df.sql, "out.sql") -def test_compile_readtable_w_limit(scalars_types_df: bpd.DataFrame, snapshot): - bf_df = scalars_types_df[["int64_col"]] +def test_compile_readtable_w_limit(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col"]] bf_df = bf_df.sort_index().head(10) snapshot.assert_match(bf_df.sql, "out.sql") From e3c06b4a07d0669a42460d081f1582b681ae3dd5 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 27 Jun 2025 16:41:43 -0700 Subject: [PATCH 22/28] docs: changed broken logo (#1866) * docs: change broken logo * use real url --- notebooks/experimental/ai_operators.ipynb | 4 ++-- notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb | 4 ++-- .../generative_ai/bq_dataframes_llm_code_generation.ipynb | 4 ++-- notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb | 4 ++-- 
.../generative_ai/bq_dataframes_llm_output_schema.ipynb | 4 ++-- .../generative_ai/bq_dataframes_llm_vector_search.ipynb | 4 ++-- .../bq_dataframes_ml_drug_name_generation.ipynb | 4 ++-- notebooks/getting_started/bq_dataframes_template.ipynb | 4 ++-- .../getting_started/getting_started_bq_dataframes.ipynb | 6 +++--- .../getting_started/ml_fundamentals_bq_dataframes.ipynb | 4 ++-- notebooks/ml/bq_dataframes_ml_linear_regression.ipynb | 4 ++-- notebooks/ml/bq_dataframes_ml_linear_regression_big.ipynb | 4 ++-- notebooks/multimodal/multimodal_dataframe.ipynb | 4 ++-- .../remote_function_vertex_claude_model.ipynb | 4 ++-- .../visualization/bq_dataframes_covid_line_graphs.ipynb | 4 ++-- notebooks/visualization/tutorial.ipynb | 4 ++-- 16 files changed, 33 insertions(+), 33 deletions(-) diff --git a/notebooks/experimental/ai_operators.ipynb b/notebooks/experimental/ai_operators.ipynb index 07e20f6bbd..977f7b9d74 100644 --- a/notebooks/experimental/ai_operators.ipynb +++ b/notebooks/experimental/ai_operators.ipynb @@ -35,12 +35,12 @@ "\n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", diff --git a/notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb b/notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb index 7734bd815d..5f6dede106 100644 --- a/notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb +++ b/notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb @@ -31,12 +31,12 @@ "\n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", diff --git a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb index edb864613c..68e10cb5ed 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb +++ 
b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb @@ -35,12 +35,12 @@ "\n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb index 9b05e1ab02..bc55096942 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -32,12 +32,12 @@ "\n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", diff --git a/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb b/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb index 04ea0571df..70714c823c 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb @@ -31,12 +31,12 @@ "\n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", diff --git a/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb b/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb index 15929fd666..b964117b67 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb @@ -35,12 +35,12 @@ "\n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", diff --git a/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb b/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb index 413e473c2f..3220bbf6cd 100644 --- 
a/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb @@ -34,12 +34,12 @@ "\n", " \n", " \n", diff --git a/notebooks/getting_started/bq_dataframes_template.ipynb b/notebooks/getting_started/bq_dataframes_template.ipynb index ae772d035e..e8002fd611 100644 --- a/notebooks/getting_started/bq_dataframes_template.ipynb +++ b/notebooks/getting_started/bq_dataframes_template.ipynb @@ -35,12 +35,12 @@ "\n", " \n", " \n", diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb index ccecd09cb9..384f3b9c10 100644 --- a/notebooks/getting_started/getting_started_bq_dataframes.ipynb +++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb @@ -35,12 +35,12 @@ "\n", " \n", " \n", @@ -1658,7 +1658,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.10" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb b/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb index d95447f7e5..3370e94713 100644 --- a/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb +++ b/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb @@ -35,12 +35,12 @@ "\n", " \n", " \n", diff --git a/notebooks/ml/bq_dataframes_ml_linear_regression.ipynb b/notebooks/ml/bq_dataframes_ml_linear_regression.ipynb index 4123dd0e1c..00aa7a347c 100644 --- a/notebooks/ml/bq_dataframes_ml_linear_regression.ipynb +++ b/notebooks/ml/bq_dataframes_ml_linear_regression.ipynb @@ -36,12 +36,12 @@ "\n", " \n", " \n", diff --git a/notebooks/ml/bq_dataframes_ml_linear_regression_big.ipynb b/notebooks/ml/bq_dataframes_ml_linear_regression_big.ipynb index 0c5106f8f4..5c016f9157 100644 --- a/notebooks/ml/bq_dataframes_ml_linear_regression_big.ipynb +++ b/notebooks/ml/bq_dataframes_ml_linear_regression_big.ipynb @@ 
-36,12 +36,12 @@ "\n", " \n", " \n", diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index fbe074b0d0..f6f80b0009 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -33,12 +33,12 @@ "\n", " \n", " \n", diff --git a/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb b/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb index 605f879bc7..9792c90205 100644 --- a/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb +++ b/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb @@ -10,12 +10,12 @@ "\n", " \n", " \n", diff --git a/notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb b/notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb index b98589c2ae..f0dd5eb678 100644 --- a/notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb +++ b/notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb @@ -35,12 +35,12 @@ "\n", " \n", " \n", diff --git a/notebooks/visualization/tutorial.ipynb b/notebooks/visualization/tutorial.ipynb index 96aff12452..0923e03bc7 100644 --- a/notebooks/visualization/tutorial.ipynb +++ b/notebooks/visualization/tutorial.ipynb @@ -33,12 +33,12 @@ "\n", " \n", " \n", From fab3c387b2ad66043244fa813a366e613b41c60f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 30 Jun 2025 08:33:41 -0500 Subject: [PATCH 23/28] fix: generate GoogleSQL instead of legacy SQL data types for `dry_run=True` from `bpd._read_gbq_colab` with local pandas DataFrame (#1867) * fix: generate GoogleSQL instead of legacy SQL data types for `dry_run=True` from `bpd._read_gbq_colab` with local pandas DataFrame * finish adding system tests * add unit tests * map legacy sql types to googlesql * fix more unit tests * fix mypy * fix python 3.9 tests shapely --- bigframes/core/tools/bigquery_schema.py | 13 +- bigframes/pandas/io/api.py 
| 51 ++- tests/system/small/pandas/io/__init__.py | 13 + tests/system/small/pandas/io/api/__init__.py | 13 + .../pandas/io/api/test_read_gbq_colab.py | 329 ++++++++++++++++++ tests/unit/core/test_pyformat.py | 315 ++++++++++++++++- tests/unit/core/tools/test_bigquery_schema.py | 26 +- 7 files changed, 713 insertions(+), 47 deletions(-) create mode 100644 tests/system/small/pandas/io/__init__.py create mode 100644 tests/system/small/pandas/io/api/__init__.py create mode 100644 tests/system/small/pandas/io/api/test_read_gbq_colab.py diff --git a/bigframes/core/tools/bigquery_schema.py b/bigframes/core/tools/bigquery_schema.py index 227a69e0f7..eef7364a1b 100644 --- a/bigframes/core/tools/bigquery_schema.py +++ b/bigframes/core/tools/bigquery_schema.py @@ -18,6 +18,12 @@ import google.cloud.bigquery +_LEGACY_TO_GOOGLESQL_TYPES = { + "BOOLEAN": "BOOL", + "INTEGER": "INT64", + "FLOAT": "FLOAT64", +} + def _type_to_sql(field: google.cloud.bigquery.SchemaField): """Turn the type information of the field into SQL. @@ -26,7 +32,12 @@ def _type_to_sql(field: google.cloud.bigquery.SchemaField): """ if field.field_type.casefold() in ("record", "struct"): return _to_struct(field.fields) - return field.field_type + + # Map from legacy SQL names (the ones used in the BigQuery schema API) to + # the GoogleSQL types. Importantly, FLOAT is from legacy SQL, but not valid + # in GoogleSQL. See internal issue b/428190014. 
+ type_ = _LEGACY_TO_GOOGLESQL_TYPES.get(field.field_type.upper(), field.field_type) + return type_ def _field_to_sql(field: google.cloud.bigquery.SchemaField): diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index 608eaf5a82..003de5913f 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -218,8 +218,27 @@ def read_gbq( read_gbq.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq) +def _run_read_gbq_colab_sessionless_dry_run( + query: str, + *, + pyformat_args: Dict[str, Any], +) -> pandas.Series: + """Run a dry_run without a session.""" + + query_formatted = bigframes.core.pyformat.pyformat( + query, + pyformat_args=pyformat_args, + dry_run=True, + ) + bqclient = _get_bqclient() + job = _dry_run(query_formatted, bqclient) + return dry_runs.get_query_stats_with_inferred_dtypes(job, (), ()) + + def _try_read_gbq_colab_sessionless_dry_run( - create_query: Callable[[], str], + query: str, + *, + pyformat_args: Dict[str, Any], ) -> Optional[pandas.Series]: """Run a dry_run without a session, only if the session hasn't yet started.""" @@ -230,10 +249,9 @@ def _try_read_gbq_colab_sessionless_dry_run( # to local data and not any BigQuery tables. with _default_location_lock: if not config.options.bigquery._session_started: - bqclient = _get_bqclient() - query = create_query() - job = _dry_run(query, bqclient) - return dry_runs.get_query_stats_with_inferred_dtypes(job, (), ()) + return _run_read_gbq_colab_sessionless_dry_run( + query, pyformat_args=pyformat_args + ) # Explicitly return None to indicate that we didn't run the dry run query. return None @@ -286,21 +304,13 @@ def _read_gbq_colab( if pyformat_args is None: pyformat_args = {} - # Delay formatting the query with the special "session-less" logic. This - # avoids doing unnecessary work if the session already has a location or has - # already started. 
- create_query = functools.partial( - bigframes.core.pyformat.pyformat, - query_or_table, - pyformat_args=pyformat_args, - dry_run=True, - ) - # Only try to set the global location if it's not a dry run. We don't want # to bind to a location too early. This is especially important if the query # only refers to local data and not any BigQuery tables. if dry_run: - result = _try_read_gbq_colab_sessionless_dry_run(create_query) + result = _try_read_gbq_colab_sessionless_dry_run( + query_or_table, pyformat_args=pyformat_args + ) if result is not None: return result @@ -309,6 +319,15 @@ def _read_gbq_colab( # started. That means we can safely call the "real" _read_gbq_colab, # which generates slightly nicer SQL. else: + # Delay formatting the query with the special "session-less" logic. This + # avoids doing unnecessary work if the session already has a location or has + # already started. + create_query = functools.partial( + bigframes.core.pyformat.pyformat, + query_or_table, + pyformat_args=pyformat_args, + dry_run=True, + ) _set_default_session_location_if_possible_deferred_query(create_query) return global_session.with_default_session( diff --git a/tests/system/small/pandas/io/__init__.py b/tests/system/small/pandas/io/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/system/small/pandas/io/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/system/small/pandas/io/api/__init__.py b/tests/system/small/pandas/io/api/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/system/small/pandas/io/api/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/system/small/pandas/io/api/test_read_gbq_colab.py b/tests/system/small/pandas/io/api/test_read_gbq_colab.py new file mode 100644 index 0000000000..6e848ed9ea --- /dev/null +++ b/tests/system/small/pandas/io/api/test_read_gbq_colab.py @@ -0,0 +1,329 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import datetime +import decimal + +import db_dtypes # type: ignore +import geopandas # type: ignore +import numpy +import pandas +import pyarrow +import pytest +import shapely.geometry # type: ignore + +from bigframes.pandas.io import api as module_under_test + + +@pytest.mark.parametrize( + ("df_pd",), + ( + # Regression tests for b/428190014. + # + # Test every BigQuery type we support, especially those where the legacy + # SQL type name differs from the GoogleSQL type name. + # + # See: + # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types + # and compare to the legacy types at + # https://cloud.google.com/bigquery/docs/data-types + pytest.param( + pandas.DataFrame( + { + "ints": pandas.Series( + [[1], [2], [3]], + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.int64())), + ), + "floats": pandas.Series( + [[1.0], [2.0], [3.0]], + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.float64())), + ), + } + ), + id="arrays", + ), + pytest.param( + pandas.DataFrame( + { + "bool": pandas.Series([True, False, True], dtype="bool"), + "boolean": pandas.Series([True, None, True], dtype="boolean"), + "object": pandas.Series([True, None, True], dtype="object"), + "arrow": pandas.Series( + [True, None, True], dtype=pandas.ArrowDtype(pyarrow.bool_()) + ), + } + ), + id="bools", + ), + pytest.param( + pandas.DataFrame( + { + "bytes": pandas.Series([b"a", b"b", b"c"], dtype=numpy.bytes_), + "object": pandas.Series([b"a", None, b"c"], dtype="object"), + "arrow": pandas.Series( + [b"a", None, b"c"], dtype=pandas.ArrowDtype(pyarrow.binary()) + ), + } + ), + id="bytes", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.date(2023, 11, 23), + None, + datetime.date(1970, 1, 1), + ], + dtype="object", + ), + "arrow": pandas.Series( + [ + datetime.date(2023, 11, 23), + None, + datetime.date(1970, 1, 1), + ], + dtype=pandas.ArrowDtype(pyarrow.date32()), + ), + } + ), + id="dates", + ), + 
pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype="object", + ), + "datetime64": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype="datetime64[us]", + ), + "arrow": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype=pandas.ArrowDtype(pyarrow.timestamp("us")), + ), + } + ), + id="datetimes", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + shapely.geometry.Point(145.0, -37.8), + None, + shapely.geometry.Point(-122.3, 47.6), + ], + dtype="object", + ), + "geopandas": geopandas.GeoSeries( + [ + shapely.geometry.Point(145.0, -37.8), + None, + shapely.geometry.Point(-122.3, 47.6), + ] + ), + } + ), + id="geographys", + ), + # TODO(tswast): Add INTERVAL once BigFrames supports it. + pytest.param( + pandas.DataFrame( + { + # TODO(tswast): Is there an equivalent object type we can use here? 
+ # TODO(tswast): Add built-in Arrow extension type + "db_dtypes": pandas.Series( + ["{}", None, "123"], + dtype=pandas.ArrowDtype(db_dtypes.JSONArrowType()), + ), + } + ), + id="jsons", + ), + pytest.param( + pandas.DataFrame( + { + "int64": pandas.Series([1, 2, 3], dtype="int64"), + "Int64": pandas.Series([1, None, 3], dtype="Int64"), + "object": pandas.Series([1, None, 3], dtype="object"), + "arrow": pandas.Series( + [1, None, 3], dtype=pandas.ArrowDtype(pyarrow.int64()) + ), + } + ), + id="ints", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [decimal.Decimal("1.23"), None, decimal.Decimal("4.56")], + dtype="object", + ), + "arrow": pandas.Series( + [decimal.Decimal("1.23"), None, decimal.Decimal("4.56")], + dtype=pandas.ArrowDtype(pyarrow.decimal128(38, 9)), + ), + } + ), + id="numerics", + ), + pytest.param( + pandas.DataFrame( + { + # TODO(tswast): Add object type for BIGNUMERIC. Can bigframes disambiguate? + "arrow": pandas.Series( + [decimal.Decimal("1.23"), None, decimal.Decimal("4.56")], + dtype=pandas.ArrowDtype(pyarrow.decimal256(76, 38)), + ), + } + ), + id="bignumerics", + ), + pytest.param( + pandas.DataFrame( + { + "float64": pandas.Series([1.23, None, 4.56], dtype="float64"), + "Float64": pandas.Series([1.23, None, 4.56], dtype="Float64"), + "object": pandas.Series([1.23, None, 4.56], dtype="object"), + "arrow": pandas.Series( + [1.23, None, 4.56], dtype=pandas.ArrowDtype(pyarrow.float64()) + ), + } + ), + id="floats", + ), + # TODO(tswast): Add RANGE once BigFrames supports it. + pytest.param( + pandas.DataFrame( + { + "string": pandas.Series(["a", "b", "c"], dtype="string[python]"), + "object": pandas.Series(["a", None, "c"], dtype="object"), + "arrow": pandas.Series(["a", None, "c"], dtype="string[pyarrow]"), + } + ), + id="strings", + ), + pytest.param( + pandas.DataFrame( + { + # TODO(tswast): Add object type for STRUCT? How to tell apart from JSON? 
+ "arrow": pandas.Series( + [{"a": 1, "b": 1.0, "c": "c"}], + dtype=pandas.ArrowDtype( + pyarrow.struct( + [ + ("a", pyarrow.int64()), + ("b", pyarrow.float64()), + ("c", pyarrow.string()), + ] + ) + ), + ), + } + ), + id="structs", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.time(0, 0, 0), + None, + datetime.time(13, 7, 11), + ], + dtype="object", + ), + "arrow": pandas.Series( + [ + datetime.time(0, 0, 0), + None, + datetime.time(13, 7, 11), + ], + dtype=pandas.ArrowDtype(pyarrow.time64("us")), + ), + } + ), + id="times", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.datetime( + 2023, 11, 23, 13, 14, 15, tzinfo=datetime.timezone.utc + ), + None, + datetime.datetime( + 1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + ], + dtype="object", + ), + "datetime64": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype="datetime64[us]", + ).dt.tz_localize("UTC"), + "arrow": pandas.Series( + [ + datetime.datetime( + 2023, 11, 23, 13, 14, 15, tzinfo=datetime.timezone.utc + ), + None, + datetime.datetime( + 1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + ], + dtype=pandas.ArrowDtype(pyarrow.timestamp("us", "UTC")), + ), + } + ), + id="timestamps", + ), + ), +) +def test_read_gbq_colab_sessionless_dry_run_generates_valid_sql_for_local_dataframe( + df_pd: pandas.DataFrame, +): + # This method will fail with an exception if it receives invalid SQL. 
+ result = module_under_test._run_read_gbq_colab_sessionless_dry_run( + query="SELECT * FROM {df_pd}", + pyformat_args={"df_pd": df_pd}, + ) + assert isinstance(result, pandas.Series) diff --git a/tests/unit/core/test_pyformat.py b/tests/unit/core/test_pyformat.py index 05110d8485..447ce37766 100644 --- a/tests/unit/core/test_pyformat.py +++ b/tests/unit/core/test_pyformat.py @@ -19,13 +19,19 @@ from __future__ import annotations +import datetime import decimal from typing import Any, Dict, List +import db_dtypes # type: ignore +import geopandas # type: ignore import google.cloud.bigquery import google.cloud.bigquery.table +import numpy import pandas +import pyarrow import pytest +import shapely.geometry # type: ignore from bigframes.core import pyformat from bigframes.testing import mocks @@ -91,42 +97,313 @@ def test_pyformat_with_no_variables(session): pytest.param( # Empty columns default to floating point, just like pandas. pandas.DataFrame({"empty column": []}), - "STRUCT<`empty column` FLOAT>", + "STRUCT<`empty column` FLOAT64>", id="empty column", ), + # Regression tests for b/428190014. + # + # Test every BigQuery type we support, especially those where the legacy + # SQL type name differs from the GoogleSQL type name. 
+ # + # See: + # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types + # and compare to the legacy types at + # https://cloud.google.com/bigquery/docs/data-types + # + # Test these against the real BigQuery dry run API in + # tests/system/small/pandas/io/api/test_read_gbq_colab.py pytest.param( pandas.DataFrame( { - "col1": [1, 2, 3], - "col2": ["a", "b", "c"], - "col3": [ - decimal.Decimal(1), - decimal.Decimal(2), - decimal.Decimal(3), - ], + "ints": pandas.Series( + [[1], [2], [3]], + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.int64())), + ), + "floats": pandas.Series( + [[1.0], [2.0], [3.0]], + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.float64())), + ), } ), - "STRUCT<`col1` INTEGER, `col2` STRING, `col3` NUMERIC>", - id="scalars", + "STRUCT<`ints` ARRAY, `floats` ARRAY>", + id="arrays", ), pytest.param( pandas.DataFrame( - {"array col": [[1, 2, 3]], "another array": [["a", "b", "c"]]} + { + "bool": pandas.Series([True, False, True], dtype="bool"), + "boolean": pandas.Series([True, None, True], dtype="boolean"), + "object": pandas.Series([True, None, True], dtype="object"), + "arrow": pandas.Series( + [True, None, True], dtype=pandas.ArrowDtype(pyarrow.bool_()) + ), + } ), - "STRUCT<`array col` ARRAY, `another array` ARRAY>", - id="arrays", + "STRUCT<`bool` BOOL, `boolean` BOOL, `object` BOOL, `arrow` BOOL>", + id="bools", ), pytest.param( pandas.DataFrame( { - "struct col": [ - {"subfield": {"subsubfield": 1}, "subfield2": 2}, - ], + "bytes": pandas.Series([b"a", b"b", b"c"], dtype=numpy.bytes_), + "object": pandas.Series([b"a", None, b"c"], dtype="object"), + "arrow": pandas.Series( + [b"a", None, b"c"], dtype=pandas.ArrowDtype(pyarrow.binary()) + ), + } + ), + "STRUCT<`bytes` BYTES, `object` BYTES, `arrow` BYTES>", + id="bytes", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.date(2023, 11, 23), + None, + datetime.date(1970, 1, 1), + ], + dtype="object", + ), + "arrow": pandas.Series( + [ + 
datetime.date(2023, 11, 23), + None, + datetime.date(1970, 1, 1), + ], + dtype=pandas.ArrowDtype(pyarrow.date32()), + ), + } + ), + "STRUCT<`object` DATE, `arrow` DATE>", + id="dates", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype="object", + ), + "datetime64": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype="datetime64[us]", + ), + "arrow": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype=pandas.ArrowDtype(pyarrow.timestamp("us")), + ), + } + ), + "STRUCT<`object` DATETIME, `datetime64` DATETIME, `arrow` DATETIME>", + id="datetimes", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + shapely.geometry.Point(145.0, -37.8), + None, + shapely.geometry.Point(-122.3, 47.6), + ], + dtype="object", + ), + "geopandas": geopandas.GeoSeries( + [ + shapely.geometry.Point(145.0, -37.8), + None, + shapely.geometry.Point(-122.3, 47.6), + ] + ), + } + ), + "STRUCT<`object` GEOGRAPHY, `geopandas` GEOGRAPHY>", + id="geographys", + ), + # TODO(tswast): Add INTERVAL once BigFrames supports it. + pytest.param( + pandas.DataFrame( + { + # TODO(tswast): Is there an equivalent object type we can use here? 
+ # TODO(tswast): Add built-in Arrow extension type + "db_dtypes": pandas.Series( + ["{}", None, "123"], + dtype=pandas.ArrowDtype(db_dtypes.JSONArrowType()), + ), + } + ), + "STRUCT<`db_dtypes` JSON>", + id="jsons", + ), + pytest.param( + pandas.DataFrame( + { + "int64": pandas.Series([1, 2, 3], dtype="int64"), + "Int64": pandas.Series([1, None, 3], dtype="Int64"), + "object": pandas.Series([1, None, 3], dtype="object"), + "arrow": pandas.Series( + [1, None, 3], dtype=pandas.ArrowDtype(pyarrow.int64()) + ), + } + ), + "STRUCT<`int64` INT64, `Int64` INT64, `object` INT64, `arrow` INT64>", + id="ints", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [decimal.Decimal("1.23"), None, decimal.Decimal("4.56")], + dtype="object", + ), + "arrow": pandas.Series( + [decimal.Decimal("1.23"), None, decimal.Decimal("4.56")], + dtype=pandas.ArrowDtype(pyarrow.decimal128(38, 9)), + ), + } + ), + "STRUCT<`object` NUMERIC, `arrow` NUMERIC>", + id="numerics", + ), + pytest.param( + pandas.DataFrame( + { + # TODO(tswast): Add object type for BIGNUMERIC. Can bigframes disambiguate? + "arrow": pandas.Series( + [decimal.Decimal("1.23"), None, decimal.Decimal("4.56")], + dtype=pandas.ArrowDtype(pyarrow.decimal256(76, 38)), + ), + } + ), + "STRUCT<`arrow` BIGNUMERIC>", + id="bignumerics", + ), + pytest.param( + pandas.DataFrame( + { + "float64": pandas.Series([1.23, None, 4.56], dtype="float64"), + "Float64": pandas.Series([1.23, None, 4.56], dtype="Float64"), + "object": pandas.Series([1.23, None, 4.56], dtype="object"), + "arrow": pandas.Series( + [1.23, None, 4.56], dtype=pandas.ArrowDtype(pyarrow.float64()) + ), } ), - "STRUCT<`struct col` STRUCT<`subfield` STRUCT<`subsubfield` INTEGER>, `subfield2` INTEGER>>", + "STRUCT<`float64` FLOAT64, `Float64` FLOAT64, `object` FLOAT64, `arrow` FLOAT64>", + id="floats", + ), + # TODO(tswast): Add RANGE once BigFrames supports it. 
+ pytest.param( + pandas.DataFrame( + { + "string": pandas.Series(["a", "b", "c"], dtype="string[python]"), + "object": pandas.Series(["a", None, "c"], dtype="object"), + "arrow": pandas.Series(["a", None, "c"], dtype="string[pyarrow]"), + } + ), + "STRUCT<`string` STRING, `object` STRING, `arrow` STRING>", + id="strings", + ), + pytest.param( + pandas.DataFrame( + { + # TODO(tswast): Add object type for STRUCT? How to tell apart from JSON? + "arrow": pandas.Series( + [{"a": 1, "b": 1.0, "c": "c"}], + dtype=pandas.ArrowDtype( + pyarrow.struct( + [ + ("a", pyarrow.int64()), + ("b", pyarrow.float64()), + ("c", pyarrow.string()), + ] + ) + ), + ), + } + ), + "STRUCT<`arrow` STRUCT<`a` INT64, `b` FLOAT64, `c` STRING>>", id="structs", ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.time(0, 0, 0), + None, + datetime.time(13, 7, 11), + ], + dtype="object", + ), + "arrow": pandas.Series( + [ + datetime.time(0, 0, 0), + None, + datetime.time(13, 7, 11), + ], + dtype=pandas.ArrowDtype(pyarrow.time64("us")), + ), + } + ), + "STRUCT<`object` TIME, `arrow` TIME>", + id="times", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series( + [ + datetime.datetime( + 2023, 11, 23, 13, 14, 15, tzinfo=datetime.timezone.utc + ), + None, + datetime.datetime( + 1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + ], + dtype="object", + ), + "datetime64": pandas.Series( + [ + datetime.datetime(2023, 11, 23, 13, 14, 15), + None, + datetime.datetime(1970, 1, 1, 0, 0, 0), + ], + dtype="datetime64[us]", + ).dt.tz_localize("UTC"), + "arrow": pandas.Series( + [ + datetime.datetime( + 2023, 11, 23, 13, 14, 15, tzinfo=datetime.timezone.utc + ), + None, + datetime.datetime( + 1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + ], + dtype=pandas.ArrowDtype(pyarrow.timestamp("us", "UTC")), + ), + } + ), + "STRUCT<`object` TIMESTAMP, `datetime64` TIMESTAMP, `arrow` TIMESTAMP>", + id="timestamps", + ), + # More complicated edge cases: 
pytest.param( pandas.DataFrame( { @@ -135,14 +412,14 @@ def test_pyformat_with_no_variables(session): ], } ), - "STRUCT<`array of struct col` ARRAY, `subfield2` INTEGER>>>", + "STRUCT<`array of struct col` ARRAY, `subfield2` INT64>>>", id="array_of_structs", ), pytest.param( pandas.DataFrame({"c1": [1, 2, 3], "c2": ["a", "b", "c"]}).rename( columns={"c1": "c", "c2": "c"} ), - "STRUCT<`c` INTEGER, `c_1` STRING>", + "STRUCT<`c` INT64, `c_1` STRING>", id="duplicate_column_names", ), ), diff --git a/tests/unit/core/tools/test_bigquery_schema.py b/tests/unit/core/tools/test_bigquery_schema.py index a5b0087801..aed8ae0323 100644 --- a/tests/unit/core/tools/test_bigquery_schema.py +++ b/tests/unit/core/tools/test_bigquery_schema.py @@ -9,9 +9,11 @@ "field, expected_sql", [ # Simple types - (bigquery.SchemaField("test_field", "INTEGER"), "INTEGER"), + # Note: the REST API will return Legacy SQL data types, but we need to + # map to GoogleSQL. See internal issue b/428190014. + (bigquery.SchemaField("test_field", "INTEGER"), "INT64"), (bigquery.SchemaField("test_field", "STRING"), "STRING"), - (bigquery.SchemaField("test_field", "BOOLEAN"), "BOOLEAN"), + (bigquery.SchemaField("test_field", "BOOLEAN"), "BOOL"), # RECORD/STRUCT types with nested fields directly ( bigquery.SchemaField( @@ -30,7 +32,7 @@ bigquery.SchemaField("another", "BOOLEAN"), ), ), - "STRUCT<`sub_field` INTEGER, `another` BOOLEAN>", + "STRUCT<`sub_field` INT64, `another` BOOL>", ), # Array is handled by _field_to_sql, instead. (bigquery.SchemaField("test_field", "NUMERIC", mode="REPEATED"), "NUMERIC"), @@ -54,7 +56,9 @@ def test_type_to_sql(field, expected_sql): "field, expected_sql", [ # Simple field - (bigquery.SchemaField("id", "INTEGER", "NULLABLE"), "`id` INTEGER"), + # Note: the REST API will return Legacy SQL data types, but we need to + # map to GoogleSQL. See internal issue b/428190014. 
+ (bigquery.SchemaField("id", "INTEGER", "NULLABLE"), "`id` INT64"), (bigquery.SchemaField("name", "STRING", "NULLABLE"), "`name` STRING"), # Repeated field (bigquery.SchemaField("tags", "STRING", "REPEATED"), "`tags` ARRAY"), @@ -69,7 +73,7 @@ def test_type_to_sql(field, expected_sql): bigquery.SchemaField("zip", "INTEGER"), ), ), - "`addresses` ARRAY>", + "`addresses` ARRAY>", ), # Simple STRUCT ( @@ -82,7 +86,7 @@ def test_type_to_sql(field, expected_sql): bigquery.SchemaField("city", "STRING"), ), ), - "`person` STRUCT<`age` INTEGER, `city` STRING>", + "`person` STRUCT<`age` INT64, `city` STRING>", ), ], ) @@ -102,7 +106,7 @@ def test_field_to_sql(field, expected_sql): bigquery.SchemaField("id", "INTEGER"), bigquery.SchemaField("name", "STRING"), ), - "STRUCT<`id` INTEGER, `name` STRING>", + "STRUCT<`id` INT64, `name` STRING>", ), # Nested RECORD/STRUCT ( @@ -118,7 +122,7 @@ def test_field_to_sql(field, expected_sql): ), ), ), - "STRUCT<`item_id` INTEGER, `details` STRUCT<`price` NUMERIC, `currency` STRING>>", + "STRUCT<`item_id` INT64, `details` STRUCT<`price` NUMERIC, `currency` STRING>>", ), # Repeated field ( @@ -143,7 +147,7 @@ def test_field_to_sql(field, expected_sql): ), bigquery.SchemaField("timestamp", "TIMESTAMP"), ), - "STRUCT<`event_name` STRING, `participants` ARRAY>>, `timestamp` TIMESTAMP>", + "STRUCT<`event_name` STRING, `participants` ARRAY>>, `timestamp` TIMESTAMP>", ), ], ) @@ -163,7 +167,7 @@ def test_to_struct(bqschema, expected_sql): bigquery.SchemaField("id", "INTEGER"), bigquery.SchemaField("name", "STRING"), ), - "UNNEST(ARRAY>[])", + "UNNEST(ARRAY>[])", ), # Complex schema with nested and repeated fields ( @@ -179,7 +183,7 @@ def test_to_struct(bqschema, expected_sql): ), ), ), - "UNNEST(ARRAY>>>[])", + "UNNEST(ARRAY>>>[])", ), ], ) From 633bf98fde33264be4fc9d7454e541c560589152 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 30 Jun 2025 08:34:04 -0500 Subject: [PATCH 24/28] feat: add 
`bpd.read_arrow` to convert an Arrow object into a bigframes DataFrame (#1855) * feat: Add read_arrow methods to Session and pandas Adds `read_arrow` methods to `bigframes.session.Session` and `bigframes.pandas.read_arrow` for creating BigQuery DataFrames DataFrames from PyArrow Tables. The implementation refactors existing logic from `bigframes.session._io.bigquery.read_gbq_query` for converting Arrow data into BigFrames DataFrames. Includes: - New file `bigframes/session/_io/arrow.py` with the core conversion logic. - `read_arrow(pa.Table) -> bpd.DataFrame` in `Session` class. - `read_arrow(pa.Table) -> bpd.DataFrame` in `pandas` module. - Unit and system tests for the new functionality. - Docstrings for new methods/functions. Note: Unit tests for direct DataFrame operations (shape, to_pandas) on the result of read_arrow are currently failing due to the complexity of mocking the session and executor for LocalDataNode interactions. System tests are recommended for full end-to-end validation. 
* rearrange * fix unit tests --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- bigframes/core/blocks.py | 31 +++++++ bigframes/pandas/__init__.py | 2 + bigframes/pandas/io/api.py | 16 ++++ bigframes/session/__init__.py | 18 ++++ tests/unit/session/test_io_arrow.py | 133 ++++++++++++++++++++++++++++ 5 files changed, 200 insertions(+) create mode 100644 tests/unit/session/test_io_arrow.py diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 1426459912..312035c7e0 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -50,6 +50,7 @@ import bigframes.core.identifiers import bigframes.core.join_def as join_defs import bigframes.core.ordering as ordering +import bigframes.core.pyarrow_utils as pyarrow_utils import bigframes.core.schema as bf_schema import bigframes.core.sql as sql import bigframes.core.utils as utils @@ -156,6 +157,36 @@ def __init__( self._view_ref: Optional[bigquery.TableReference] = None self._view_ref_dry_run: Optional[bigquery.TableReference] = None + @classmethod + def from_pyarrow( + cls, + data: pa.Table, + session: bigframes.Session, + ) -> Block: + column_labels = data.column_names + + # TODO(tswast): Use array_value.promote_offsets() instead once that node is + # supported by the local engine. + offsets_col = bigframes.core.guid.generate_guid() + index_ids = [offsets_col] + index_labels = [None] + + # TODO(https://github.com/googleapis/python-bigquery-dataframes/issues/859): + # Allow users to specify the "total ordering" column(s) or allow multiple + # such columns. + data = pyarrow_utils.append_offsets(data, offsets_col=offsets_col) + + # from_pyarrow will normalize the types for us. 
+ managed_data = local_data.ManagedArrowTable.from_pyarrow(data) + array_value = core.ArrayValue.from_managed(managed_data, session=session) + block = cls( + array_value, + column_labels=column_labels, + index_columns=index_ids, + index_labels=index_labels, + ) + return block + @classmethod def from_local( cls, diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index ed999e62c1..f163d25757 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -40,6 +40,7 @@ from bigframes.pandas.io.api import ( _read_gbq_colab, from_glob_path, + read_arrow, read_csv, read_gbq, read_gbq_function, @@ -367,6 +368,7 @@ def reset_session(): merge, qcut, read_csv, + read_arrow, read_gbq, _read_gbq_colab, read_gbq_function, diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index 003de5913f..65435bd902 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -44,6 +44,7 @@ ReadPickleBuffer, StorageOptions, ) +import pyarrow as pa import bigframes._config as config import bigframes.core.global_session as global_session @@ -72,6 +73,21 @@ # method and its arguments. +def read_arrow(pa_table: pa.Table) -> bigframes.dataframe.DataFrame: + """Load a PyArrow Table to a BigQuery DataFrames DataFrame. + + Args: + pa_table (pyarrow.Table): + PyArrow table to load data from. + + Returns: + bigframes.dataframe.DataFrame: + A new DataFrame representing the data from the PyArrow table. 
+ """ + session = global_session.get_global_session() + return session.read_arrow(pa_table=pa_table) + + def read_csv( filepath_or_buffer: str | IO["bytes"], *, diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 8cbcf8612e..9d113743cf 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -55,12 +55,14 @@ ReadPickleBuffer, StorageOptions, ) +import pyarrow as pa from bigframes import exceptions as bfe from bigframes import version import bigframes._config.bigquery_options as bigquery_options import bigframes.clients import bigframes.constants +import bigframes.core from bigframes.core import blocks, log_adapter, utils import bigframes.core.pyformat @@ -967,6 +969,22 @@ def _read_pandas_inline( local_block = blocks.Block.from_local(pandas_dataframe, self) return dataframe.DataFrame(local_block) + def read_arrow(self, pa_table: pa.Table) -> bigframes.dataframe.DataFrame: + """Load a PyArrow Table to a BigQuery DataFrames DataFrame. + + Args: + pa_table (pyarrow.Table): + PyArrow table to load data from. + + Returns: + bigframes.dataframe.DataFrame: + A new DataFrame representing the data from the PyArrow table. + """ + import bigframes.dataframe as dataframe + + local_block = blocks.Block.from_pyarrow(pa_table, self) + return dataframe.DataFrame(local_block) + def read_csv( self, filepath_or_buffer: str | IO["bytes"], diff --git a/tests/unit/session/test_io_arrow.py b/tests/unit/session/test_io_arrow.py new file mode 100644 index 0000000000..d5266220d9 --- /dev/null +++ b/tests/unit/session/test_io_arrow.py @@ -0,0 +1,133 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime + +import pyarrow as pa +import pytest + +import bigframes.pandas as bpd +from bigframes.testing import mocks + + +@pytest.fixture(scope="module") +def session(): + # Use the mock session from bigframes.testing + return mocks.create_bigquery_session() + + +def test_read_arrow_empty_table(session): + empty_table = pa.Table.from_pydict( + { + "col_a": pa.array([], type=pa.int64()), + "col_b": pa.array([], type=pa.string()), + } + ) + df = session.read_arrow(empty_table) + assert isinstance(df, bpd.DataFrame) + assert df.shape == (0, 2) + assert list(df.columns) == ["col_a", "col_b"] + pd_df = df.to_pandas() + assert pd_df.empty + assert list(pd_df.columns) == ["col_a", "col_b"] + assert pd_df["col_a"].dtype == "Int64" + assert pd_df["col_b"].dtype == "string[pyarrow]" + + +@pytest.mark.parametrize( + "data,arrow_type,expected_bq_type_kind", + [ + ([1, 2], pa.int8(), "INTEGER"), + ([1, 2], pa.int16(), "INTEGER"), + ([1, 2], pa.int32(), "INTEGER"), + ([1, 2], pa.int64(), "INTEGER"), + ([1.0, 2.0], pa.float32(), "FLOAT"), + ([1.0, 2.0], pa.float64(), "FLOAT"), + ([True, False], pa.bool_(), "BOOLEAN"), + (["a", "b"], pa.string(), "STRING"), + (["a", "b"], pa.large_string(), "STRING"), + ([b"a", b"b"], pa.binary(), "BYTES"), + ([b"a", b"b"], pa.large_binary(), "BYTES"), + ( + [ + pa.scalar(1000, type=pa.duration("s")), + pa.scalar(2000, type=pa.duration("s")), + ], + pa.duration("s"), + "INTEGER", + ), + ([datetime.date(2023, 1, 1)], pa.date32(), "DATE"), + ( + [datetime.datetime(2023, 1, 1, 12, 0, 0, tzinfo=datetime.timezone.utc)], + 
pa.timestamp("s", tz="UTC"), + "TIMESTAMP", + ), + ( + [datetime.datetime(2023, 1, 1, 12, 0, 0, tzinfo=datetime.timezone.utc)], + pa.timestamp("ms", tz="UTC"), + "TIMESTAMP", + ), + ( + [datetime.datetime(2023, 1, 1, 12, 0, 0, tzinfo=datetime.timezone.utc)], + pa.timestamp("us", tz="UTC"), + "TIMESTAMP", + ), + ([datetime.time(12, 34, 56, 789000)], pa.time64("us"), "TIME"), + ], +) +def test_read_arrow_type_mappings(session, data, arrow_type, expected_bq_type_kind): + """ + Tests that various arrow types are mapped to the expected BigQuery types. + This is an indirect check via the resulting DataFrame's schema. + """ + pa_table = pa.Table.from_arrays([pa.array(data, type=arrow_type)], names=["col"]) + df = session.read_arrow(pa_table) + + bigquery_schema = df._block.expr.schema.to_bigquery() + assert len(bigquery_schema) == 2 # offsets + value + field = bigquery_schema[-1] + assert field.field_type.upper() == expected_bq_type_kind + + # Also check pandas dtype after conversion for good measure + pd_df = df.to_pandas() + assert pd_df["col"].shape == (len(data),) + + +def test_read_arrow_list_type(session): + pa_table = pa.Table.from_arrays( + [pa.array([[1, 2], [3, 4, 5]], type=pa.list_(pa.int64()))], names=["list_col"] + ) + df = session.read_arrow(pa_table) + + bigquery_schema = df._block.expr.schema.to_bigquery() + assert len(bigquery_schema) == 2 # offsets + value + field = bigquery_schema[-1] + assert field.mode.upper() == "REPEATED" + assert field.field_type.upper() == "INTEGER" + + +def test_read_arrow_struct_type(session): + struct_type = pa.struct([("a", pa.int64()), ("b", pa.string())]) + pa_table = pa.Table.from_arrays( + [pa.array([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}], type=struct_type)], + names=["struct_col"], + ) + df = session.read_arrow(pa_table) + + bigquery_schema = df._block.expr.schema.to_bigquery() + assert len(bigquery_schema) == 2 # offsets + value + field = bigquery_schema[-1] + assert field.field_type.upper() == "RECORD" + assert 
field.fields[0].name == "a" + assert field.fields[1].name == "b" From 1c45ccb133091aa85bc34450704fc8cab3d9296b Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 30 Jun 2025 09:52:09 -0700 Subject: [PATCH 25/28] feat: Support local execution of comparison ops (#1849) --- bigframes/core/compile/polars/lowering.py | 55 ++++++++++++++- bigframes/core/compile/scalar_op_compiler.py | 18 +++++ bigframes/session/polars_executor.py | 11 ++- .../small/engines/test_comparison_ops.py | 70 +++++++++++++++++++ 4 files changed, 151 insertions(+), 3 deletions(-) create mode 100644 tests/system/small/engines/test_comparison_ops.py diff --git a/bigframes/core/compile/polars/lowering.py b/bigframes/core/compile/polars/lowering.py index 88e2d6e599..48d63e9ed9 100644 --- a/bigframes/core/compile/polars/lowering.py +++ b/bigframes/core/compile/polars/lowering.py @@ -12,15 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. +import dataclasses + from bigframes import dtypes from bigframes.core import bigframe_node, expression from bigframes.core.rewrite import op_lowering -from bigframes.operations import numeric_ops +from bigframes.operations import comparison_ops, numeric_ops import bigframes.operations as ops # TODO: Would be more precise to actually have separate op set for polars ops (where they diverge from the original ops) +@dataclasses.dataclass +class CoerceArgsRule(op_lowering.OpLoweringRule): + op_type: type[ops.BinaryOp] + + @property + def op(self) -> type[ops.ScalarOp]: + return self.op_type + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, self.op_type) + larg, rarg = _coerce_comparables(expr.children[0], expr.children[1]) + return expr.op.as_expr(larg, rarg) + + class LowerFloorDivRule(op_lowering.OpLoweringRule): @property def op(self) -> type[ops.ScalarOp]: @@ -40,7 +56,42 @@ def lower(self, expr: expression.OpExpression) -> expression.Expression: 
return ops.where_op.as_expr(zero_result, divisor_is_zero, expr) -POLARS_LOWERING_RULES = (LowerFloorDivRule(),) +def _coerce_comparables(expr1: expression.Expression, expr2: expression.Expression): + + target_type = dtypes.coerce_to_common(expr1.output_type, expr2.output_type) + if expr1.output_type != target_type: + expr1 = _lower_cast(ops.AsTypeOp(target_type), expr1) + if expr2.output_type != target_type: + expr2 = _lower_cast(ops.AsTypeOp(target_type), expr2) + return expr1, expr2 + + +# TODO: Need to handle bool->string cast to get capitalization correct +def _lower_cast(cast_op: ops.AsTypeOp, arg: expression.Expression): + if arg.output_type == dtypes.BOOL_DTYPE and dtypes.is_numeric(cast_op.to_type): + # bool -> decimal needs two-step cast + new_arg = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(arg) + return cast_op.as_expr(new_arg) + return cast_op.as_expr(arg) + + +LOWER_COMPARISONS = tuple( + CoerceArgsRule(op) + for op in ( + comparison_ops.EqOp, + comparison_ops.EqNullsMatchOp, + comparison_ops.NeOp, + comparison_ops.LtOp, + comparison_ops.GtOp, + comparison_ops.LeOp, + comparison_ops.GeOp, + ) +) + +POLARS_LOWERING_RULES = ( + *LOWER_COMPARISONS, + LowerFloorDivRule(), +) def lower_ops_to_polars(root: bigframe_node.BigFrameNode) -> bigframe_node.BigFrameNode: diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 075089bb7a..30da6b2cb2 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1498,6 +1498,7 @@ def eq_op( x: ibis_types.Value, y: ibis_types.Value, ): + x, y = _coerce_comparables(x, y) return x == y @@ -1507,6 +1508,7 @@ def eq_nulls_match_op( y: ibis_types.Value, ): """Variant of eq_op where nulls match each other. 
Only use where dtypes are known to be same.""" + x, y = _coerce_comparables(x, y) literal = ibis_types.literal("$NULL_SENTINEL$") if hasattr(x, "fill_null"): left = x.cast(ibis_dtypes.str).fill_null(literal) @@ -1523,6 +1525,7 @@ def ne_op( x: ibis_types.Value, y: ibis_types.Value, ): + x, y = _coerce_comparables(x, y) return x != y @@ -1534,6 +1537,17 @@ def _null_or_value(value: ibis_types.Value, where_value: ibis_types.BooleanValue ) +def _coerce_comparables( + x: ibis_types.Value, + y: ibis_types.Value, +): + if x.type().is_boolean() and not y.type().is_boolean(): + x = x.cast(ibis_dtypes.int64) + elif y.type().is_boolean() and not x.type().is_boolean(): + y = y.cast(ibis_dtypes.int64) + return x, y + + @scalar_op_compiler.register_binary_op(ops.and_op) def and_op( x: ibis_types.Value, @@ -1735,6 +1749,7 @@ def lt_op( x: ibis_types.Value, y: ibis_types.Value, ): + x, y = _coerce_comparables(x, y) return x < y @@ -1744,6 +1759,7 @@ def le_op( x: ibis_types.Value, y: ibis_types.Value, ): + x, y = _coerce_comparables(x, y) return x <= y @@ -1753,6 +1769,7 @@ def gt_op( x: ibis_types.Value, y: ibis_types.Value, ): + x, y = _coerce_comparables(x, y) return x > y @@ -1762,6 +1779,7 @@ def ge_op( x: ibis_types.Value, y: ibis_types.Value, ): + x, y = _coerce_comparables(x, y) return x >= y diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py index 24acda35dc..ec00e38606 100644 --- a/bigframes/session/polars_executor.py +++ b/bigframes/session/polars_executor.py @@ -32,11 +32,20 @@ nodes.OrderByNode, nodes.ReversedNode, nodes.SelectionNode, + nodes.ProjectionNode, nodes.SliceNode, nodes.AggregateNode, ) -_COMPATIBLE_SCALAR_OPS = () +_COMPATIBLE_SCALAR_OPS = ( + bigframes.operations.eq_op, + bigframes.operations.eq_null_match_op, + bigframes.operations.ne_op, + bigframes.operations.gt_op, + bigframes.operations.lt_op, + bigframes.operations.ge_op, + bigframes.operations.le_op, +) _COMPATIBLE_AGG_OPS = (agg_ops.SizeOp, 
agg_ops.SizeUnaryOp) diff --git a/tests/system/small/engines/test_comparison_ops.py b/tests/system/small/engines/test_comparison_ops.py new file mode 100644 index 0000000000..fefff93f58 --- /dev/null +++ b/tests/system/small/engines/test_comparison_ops.py @@ -0,0 +1,70 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import pytest + +from bigframes.core import array_value +import bigframes.operations as ops +from bigframes.session import polars_executor +from bigframes.testing.engine_utils import assert_equivalence_execution + +pytest.importorskip("polars") + +# Polars used as reference as its fast and local. Generally though, prefer gbq engine where they disagree. 
+REFERENCE_ENGINE = polars_executor.PolarsExecutor() + +# numeric domain + + +def apply_op_pairwise( + array: array_value.ArrayValue, op: ops.BinaryOp, excluded_cols=[] +) -> array_value.ArrayValue: + exprs = [] + for l_arg, r_arg in itertools.permutations(array.column_ids, 2): + if (l_arg in excluded_cols) or (r_arg in excluded_cols): + continue + try: + _ = op.output_type( + array.get_column_type(l_arg), array.get_column_type(r_arg) + ) + exprs.append(op.as_expr(l_arg, r_arg)) + except TypeError: + continue + assert len(exprs) > 0 + new_arr, _ = array.compute_values(exprs) + return new_arr + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize( + "op", + [ + ops.eq_op, + ops.eq_null_match_op, + ops.ne_op, + ops.gt_op, + ops.lt_op, + ops.le_op, + ops.ge_op, + ], +) +def test_engines_project_comparison_op( + scalars_array_value: array_value.ArrayValue, engine, op +): + # exclude string cols as does not contain dates + # bool col actually doesn't work properly for bq engine + arr = apply_op_pairwise(scalars_array_value, op, excluded_cols=["string_col"]) + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) From 81e4d64c5a3bd8d30edaf909d0bef2d1d1a51c01 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 30 Jun 2025 09:52:28 -0700 Subject: [PATCH 26/28] fix: Fix bug with DataFrame.agg for string values (#1870) --- bigframes/core/blocks.py | 19 +++++++----- bigframes/dataframe.py | 46 +++++++++++++++++++++++----- tests/system/small/test_dataframe.py | 34 +++++++++++++++++++- 3 files changed, 82 insertions(+), 17 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 312035c7e0..6d476cc795 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2030,7 +2030,7 @@ def _generate_resample_label( return block.set_index([resample_label_id]) def _create_stack_column(self, col_label: typing.Tuple, stack_labels: pd.Index): - dtype = None + input_dtypes = [] input_columns: 
list[Optional[str]] = [] for uvalue in utils.index_as_tuples(stack_labels): label_to_match = (*col_label, *uvalue) @@ -2040,15 +2040,18 @@ def _create_stack_column(self, col_label: typing.Tuple, stack_labels: pd.Index): matching_ids = self.label_to_col_id.get(label_to_match, []) input_id = matching_ids[0] if len(matching_ids) > 0 else None if input_id: - if dtype and dtype != self._column_type(input_id): - raise NotImplementedError( - "Cannot stack columns with non-matching dtypes." - ) - else: - dtype = self._column_type(input_id) + input_dtypes.append(self._column_type(input_id)) input_columns.append(input_id) # Input column i is the first one that - return tuple(input_columns), dtype or pd.Float64Dtype() + if len(input_dtypes) > 0: + output_dtype = bigframes.dtypes.lcd_type(*input_dtypes) + if output_dtype is None: + raise NotImplementedError( + "Cannot stack columns with non-matching dtypes." + ) + else: + output_dtype = pd.Float64Dtype() + return tuple(input_columns), output_dtype def _column_type(self, col_id: str) -> bigframes.dtypes.Dtype: col_offset = self.value_columns.index(col_id) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 495e242f43..1ca5b8b035 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3004,14 +3004,44 @@ def agg( if utils.is_dict_like(func): # Must check dict-like first because dictionaries are list-like # according to Pandas. 
- agg_cols = [] - for col_label, agg_func in func.items(): - agg_cols.append(self[col_label].agg(agg_func)) - - from bigframes.core.reshape import api as reshape - - return reshape.concat(agg_cols, axis=1) + aggs = [] + labels = [] + funcnames = [] + for col_label, agg_func in func.items(): + agg_func_list = agg_func if utils.is_list_like(agg_func) else [agg_func] + col_id = self._block.resolve_label_exact(col_label) + if col_id is None: + raise KeyError(f"Column {col_label} does not exist") + for agg_func in agg_func_list: + agg_op = agg_ops.lookup_agg_func(typing.cast(str, agg_func)) + agg_expr = ( + ex.UnaryAggregation(agg_op, ex.deref(col_id)) + if isinstance(agg_op, agg_ops.UnaryAggregateOp) + else ex.NullaryAggregation(agg_op) + ) + aggs.append(agg_expr) + labels.append(col_label) + funcnames.append(agg_func) + + # if any list in dict values, format output differently + if any(utils.is_list_like(v) for v in func.values()): + new_index, _ = self.columns.reindex(labels) + new_index = utils.combine_indices(new_index, pandas.Index(funcnames)) + agg_block, _ = self._block.aggregate( + aggregations=aggs, column_labels=new_index + ) + return DataFrame(agg_block).stack().droplevel(0, axis="index") + else: + new_index, _ = self.columns.reindex(labels) + agg_block, _ = self._block.aggregate( + aggregations=aggs, column_labels=new_index + ) + return bigframes.series.Series( + agg_block.transpose( + single_row_mode=True, original_row_index=pandas.Index([None]) + ) + ) elif utils.is_list_like(func): aggregations = [agg_ops.lookup_agg_func(f) for f in func] @@ -3027,7 +3057,7 @@ def agg( ) ) - else: + else: # function name string return bigframes.series.Series( self._block.aggregate_all_and_stack( agg_ops.lookup_agg_func(typing.cast(str, func)) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index d5446efcd0..e8d156538f 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -5538,7 +5538,7 @@ 
def test_astype_invalid_type_fail(scalars_dfs): bf_df.astype(123) -def test_agg_with_dict(scalars_dfs): +def test_agg_with_dict_lists(scalars_dfs): bf_df, pd_df = scalars_dfs agg_funcs = { "int64_too": ["min", "max"], @@ -5553,6 +5553,38 @@ def test_agg_with_dict(scalars_dfs): ) +def test_agg_with_dict_list_and_str(scalars_dfs): + bf_df, pd_df = scalars_dfs + agg_funcs = { + "int64_too": ["min", "max"], + "int64_col": "sum", + } + + bf_result = bf_df.agg(agg_funcs).to_pandas() + pd_result = pd_df.agg(agg_funcs) + + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_agg_with_dict_strs(scalars_dfs): + bf_df, pd_df = scalars_dfs + agg_funcs = { + "int64_too": "min", + "int64_col": "sum", + "float64_col": "max", + } + + bf_result = bf_df.agg(agg_funcs).to_pandas() + pd_result = pd_df.agg(agg_funcs) + pd_result.index = pd_result.index.astype("string[pyarrow]") + + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + def test_agg_with_dict_containing_non_existing_col_raise_key_error(scalars_dfs): bf_df, _ = scalars_dfs agg_funcs = { From bb981783d5124f0e1e0c80f7eb6416165264c0dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 30 Jun 2025 17:02:28 -0500 Subject: [PATCH 27/28] chore: fix `read_gbq_colab` benchmark (#1872) * chore: fix `read_gbq_colab` benchmark * Correct the table size to match actual percentiles. * Only do sum() on numeric columns. 
* fix for filter bench --- .../create_read_gbq_colab_benchmark_tables.py | 17 +++-------------- .../read_gbq_colab/aggregate_output.py | 2 +- tests/benchmark/read_gbq_colab/filter_output.py | 12 ++++++++++-- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/scripts/create_read_gbq_colab_benchmark_tables.py b/scripts/create_read_gbq_colab_benchmark_tables.py index 703c946360..63419bc660 100644 --- a/scripts/create_read_gbq_colab_benchmark_tables.py +++ b/scripts/create_read_gbq_colab_benchmark_tables.py @@ -42,18 +42,6 @@ 17486432.0, 1919625975.0, ], - "num_materialized_or_scanned_rows": [ - 0.0, - 6.0, - 100.0, - 4955.0, - 23108.0, - 139504.0, - 616341.0, - 3855698.0, - 83725698.0, - 5991998082.0, - ], "avg_row_bytes": [ 0.00014346299635435792, 0.005370969708923197, @@ -524,10 +512,11 @@ def main(): for i in range(num_percentiles): percentile = TABLE_STATS["percentile"][i] avg_row_bytes_raw = TABLE_STATS["avg_row_bytes"][i] - num_rows_raw = TABLE_STATS["num_materialized_or_scanned_rows"][i] + table_bytes_raw = TABLE_STATS["materialized_or_scanned_bytes"][i] + target_table_bytes = max(1, int(math.ceil(table_bytes_raw))) target_row_bytes = max(1, int(math.ceil(avg_row_bytes_raw))) - num_rows = max(1, int(math.ceil(num_rows_raw))) + num_rows = max(1, int(math.ceil(target_table_bytes / target_row_bytes))) table_name = f"percentile_{percentile:02d}" print(f"\n--- Processing Table: {table_name} ---") diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py index b612e2998c..dda4bf95a4 100644 --- a/tests/benchmark/read_gbq_colab/aggregate_output.py +++ b/tests/benchmark/read_gbq_colab/aggregate_output.py @@ -44,7 +44,7 @@ def aggregate_output( df_aggregated = ( df.assign(rounded=df[group_column].astype("Int64").round(-9)) .groupby("rounded") - .sum() + .sum(numeric_only=True) ) df_aggregated.shape diff --git a/tests/benchmark/read_gbq_colab/filter_output.py 
b/tests/benchmark/read_gbq_colab/filter_output.py index 7945d9f0c6..5e872bb727 100644 --- a/tests/benchmark/read_gbq_colab/filter_output.py +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -14,6 +14,7 @@ import pathlib import benchmark.utils as utils +import pytest import bigframes.session @@ -35,8 +36,15 @@ def filter_output( # Simulate the user filtering by a column and visualizing those results df_filtered = df[df["col_bool_0"]] - df_filtered.shape - next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE))) + rows, _ = df_filtered.shape + + # It's possible we don't have any pages at all, since we filtered out all + # matching rows. + if rows == 0: + with pytest.raises(StopIteration): + next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE))) + else: + next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE))) if __name__ == "__main__": From a4682e919e00c50e75ad57d273edfbc8b3bb78bd Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Tue, 1 Jul 2025 09:23:25 -0700 Subject: [PATCH 28/28] chore(main): release 2.9.0 (#1845) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 28 +++++++++++++++++++++++ bigframes/version.py | 4 ++-- third_party/bigframes_vendored/version.py | 4 ++-- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f649f2f8a4..313064241d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,34 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.9.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.8.0...v2.9.0) (2025-06-30) + + +### Features + +* Add `bpd.read_arrow` to convert an Arrow object into a bigframes DataFrame ([#1855](https://github.com/googleapis/python-bigquery-dataframes/issues/1855)) ([633bf98](https://github.com/googleapis/python-bigquery-dataframes/commit/633bf98fde33264be4fc9d7454e541c560589152)) +* Add experimental 
polars execution ([#1747](https://github.com/googleapis/python-bigquery-dataframes/issues/1747)) ([daf0c3b](https://github.com/googleapis/python-bigquery-dataframes/commit/daf0c3b349fb1e85e7070c54a2d3f5460f5e40c9)) +* Add size op support in local engine ([#1865](https://github.com/googleapis/python-bigquery-dataframes/issues/1865)) ([942e66c](https://github.com/googleapis/python-bigquery-dataframes/commit/942e66c483c9afbb680a7af56c9e9a76172a33e1)) +* Create `deploy_remote_function` and `deploy_udf` functions to immediately deploy functions to BigQuery ([#1832](https://github.com/googleapis/python-bigquery-dataframes/issues/1832)) ([c706759](https://github.com/googleapis/python-bigquery-dataframes/commit/c706759b85359b6d23ce3449f6ab138ad2d22f9d)) +* Support index item assign in Series ([#1868](https://github.com/googleapis/python-bigquery-dataframes/issues/1868)) ([c5d251a](https://github.com/googleapis/python-bigquery-dataframes/commit/c5d251a1d454bb4ef55ea9905faeadd646a23b14)) +* Support item assignment in series ([#1859](https://github.com/googleapis/python-bigquery-dataframes/issues/1859)) ([25684ff](https://github.com/googleapis/python-bigquery-dataframes/commit/25684ff60367f49dd318d4677a7438abdc98bff9)) +* Support local execution of comparison ops ([#1849](https://github.com/googleapis/python-bigquery-dataframes/issues/1849)) ([1c45ccb](https://github.com/googleapis/python-bigquery-dataframes/commit/1c45ccb133091aa85bc34450704fc8cab3d9296b)) + + +### Bug Fixes + +* Fix bug selecting column repeatedly ([#1858](https://github.com/googleapis/python-bigquery-dataframes/issues/1858)) ([cc339e9](https://github.com/googleapis/python-bigquery-dataframes/commit/cc339e9938129cac896460e3a794b3ec8479fa4a)) +* Fix bug with DataFrame.agg for string values ([#1870](https://github.com/googleapis/python-bigquery-dataframes/issues/1870)) ([81e4d64](https://github.com/googleapis/python-bigquery-dataframes/commit/81e4d64c5a3bd8d30edaf909d0bef2d1d1a51c01)) +* Generate GoogleSQL 
instead of legacy SQL data types for `dry_run=True` from `bpd._read_gbq_colab` with local pandas DataFrame ([#1867](https://github.com/googleapis/python-bigquery-dataframes/issues/1867)) ([fab3c38](https://github.com/googleapis/python-bigquery-dataframes/commit/fab3c387b2ad66043244fa813a366e613b41c60f)) +* Revert dict back to protobuf in the iam binding update ([#1838](https://github.com/googleapis/python-bigquery-dataframes/issues/1838)) ([9fb3cb4](https://github.com/googleapis/python-bigquery-dataframes/commit/9fb3cb444607df6736d383a2807059bca470c453)) + + +### Documentation + +* Add data visualization samples for public doc ([#1847](https://github.com/googleapis/python-bigquery-dataframes/issues/1847)) ([15e1277](https://github.com/googleapis/python-bigquery-dataframes/commit/15e1277b1413de18a5e36f72959a99701d6df08b)) +* Changed broken logo ([#1866](https://github.com/googleapis/python-bigquery-dataframes/issues/1866)) ([e3c06b4](https://github.com/googleapis/python-bigquery-dataframes/commit/e3c06b4a07d0669a42460d081f1582b681ae3dd5)) +* Update ai.forecast notebook ([#1844](https://github.com/googleapis/python-bigquery-dataframes/issues/1844)) ([1863538](https://github.com/googleapis/python-bigquery-dataframes/commit/186353888db537b561ee994256f998df361b4071)) + ## [2.8.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.7.0...v2.8.0) (2025-06-23) diff --git a/bigframes/version.py b/bigframes/version.py index 5d2de2f97f..4f3c9a5124 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.8.0" +__version__ = "2.9.0" # {x-release-please-start-date} -__release_date__ = "2025-06-23" +__release_date__ = "2025-06-30" # {x-release-please-end} diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 5d2de2f97f..4f3c9a5124 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.8.0" +__version__ = "2.9.0" # {x-release-please-start-date} -__release_date__ = "2025-06-23" +__release_date__ = "2025-06-30" # {x-release-please-end}
\n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", " \n", " \n", - " \"Colab Run in Colab\n", + " \"Colab Run in Colab\n", " \n", " \n", " \n", - " \"GitHub\n", + " \"GitHub\n", " View on GitHub\n", " \n", "