Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: update error logging when converting to pyarrow column fails #1836

Merged
merged 31 commits into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
0b41532
fix: update error logging when converting to pyarrow column fails
chalmerlowe Feb 29, 2024
f7f0501
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Feb 29, 2024
0d543e4
resolve merge conflict
chalmerlowe Feb 29, 2024
fe5082b
resolve missing dependency
chalmerlowe Feb 29, 2024
e396b01
more tweaks to constraints and requirements re pyarrow
chalmerlowe Feb 29, 2024
0d25ca4
even more tweaks to constraints and requirements re pyarrow
chalmerlowe Feb 29, 2024
71b108f
a few more tweaks to constraints and requirements re pyarrow
chalmerlowe Feb 29, 2024
ead177d
Merge branch 'main' into name-series-in-error-log
chalmerlowe Mar 8, 2024
3220881
Merge branch 'main' into name-series-in-error-log
chalmerlowe Mar 11, 2024
1030756
Merge branch 'main' into name-series-in-error-log
chalmerlowe Mar 12, 2024
55a97ac
resolves issue of pyarrow not installing
chalmerlowe Mar 13, 2024
aeb0739
fix linting issue
chalmerlowe Mar 13, 2024
c262d8c
update linting and conditionals
chalmerlowe Mar 13, 2024
91913b0
update linting and mypy comments
chalmerlowe Mar 13, 2024
e39fe1d
quick tags on several coverage issues related to imports
chalmerlowe Mar 13, 2024
6975ce3
adds pragma to exception
chalmerlowe Mar 14, 2024
17d63be
updates test suite with new test and makes msg explicit
chalmerlowe Mar 15, 2024
f921c07
temporarily adding timing code
chalmerlowe Mar 15, 2024
2186d24
additional timing test mods
chalmerlowe Mar 15, 2024
6cefc01
add pragmas to account for several tests
chalmerlowe Mar 18, 2024
747beb6
Merge branch 'main' into name-series-in-error-log
chalmerlowe Mar 18, 2024
2a49bd5
cleaned up some test code
chalmerlowe Mar 18, 2024
caa0256
cleaned up some test code
chalmerlowe Mar 18, 2024
761b64d
Update a test to include column datatype
chalmerlowe Mar 18, 2024
4be910c
update to pytest.raises command
chalmerlowe Mar 18, 2024
43de81d
Update tests/unit/test__pandas_helpers.py
chalmerlowe Mar 19, 2024
bad4c0c
Merge branch 'main' into name-series-in-error-log
chalmerlowe Mar 19, 2024
0f71762
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Mar 19, 2024
0b786ba
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Mar 19, 2024
73066a4
Merge branch 'name-series-in-error-log' of https://github.com/googlea…
gcf-owl-bot[bot] Mar 19, 2024
2cca046
removed unused variable 'e'
chalmerlowe Mar 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
20 changes: 13 additions & 7 deletions google/cloud/bigquery/_pandas_helpers.py
Expand Up @@ -49,10 +49,11 @@
db_dtypes_import_exception = exc
date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype

pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()
pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)
from pyarrow import ArrowTypeError # type: ignore # noqa: E402

_BIGNUMERIC_SUPPORT = False
if pyarrow is not None:
if pyarrow is not None: # pragma: NO COVER
_BIGNUMERIC_SUPPORT = True

try:
Expand Down Expand Up @@ -302,11 +303,16 @@ def bq_to_arrow_array(series, bq_field):

field_type_upper = bq_field.field_type.upper() if bq_field.field_type else ""

if bq_field.mode.upper() == "REPEATED":
return pyarrow.ListArray.from_pandas(series, type=arrow_type)
if field_type_upper in schema._STRUCT_TYPES:
return pyarrow.StructArray.from_pandas(series, type=arrow_type)
return pyarrow.Array.from_pandas(series, type=arrow_type)
try:
if bq_field.mode.upper() == "REPEATED":
return pyarrow.ListArray.from_pandas(series, type=arrow_type)
if field_type_upper in schema._STRUCT_TYPES:
return pyarrow.StructArray.from_pandas(series, type=arrow_type)
return pyarrow.Array.from_pandas(series, type=arrow_type)
except ArrowTypeError: # pragma: NO COVER
chalmerlowe marked this conversation as resolved.
Show resolved Hide resolved
msg = f"""Error converting Pandas column with name: "{series.name}" to pyarrow datatype: Array, ListArray, or StructArray"""
chalmerlowe marked this conversation as resolved.
Show resolved Hide resolved
_LOGGER.error(msg)
raise ArrowTypeError(msg)


def get_column_or_index(dataframe, name):
Expand Down
2 changes: 1 addition & 1 deletion google/cloud/bigquery/_pyarrow_helpers.py
Expand Up @@ -49,7 +49,7 @@ def pyarrow_timestamp():
_BQ_TO_ARROW_SCALARS = {}
_ARROW_SCALAR_IDS_TO_BQ = {}

if pyarrow:
if pyarrow: # pragma: NO COVER
chalmerlowe marked this conversation as resolved.
Show resolved Hide resolved
# This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py
# When modifying it be sure to update it there as well.
# Note(todo!!): type "BIGNUMERIC"'s matching pyarrow type is added in _pandas_helpers.py
Expand Down
15 changes: 11 additions & 4 deletions noxfile.py
Expand Up @@ -18,7 +18,6 @@
import os
import re
import shutil

import nox


Expand Down Expand Up @@ -66,6 +65,7 @@ def default(session, install_extras=True):
Python corresponding to the ``nox`` binary the ``PATH`` can
run the tests.
"""

constraints_path = str(
CURRENT_DIRECTORY / "testing" / f"constraints-{session.python}.txt"
)
Expand All @@ -86,8 +86,7 @@ def default(session, install_extras=True):
install_target = ".[all]"
else:
install_target = "."
session.install("-e", install_target, "-c", constraints_path)

session.install("-e", install_target)
session.run("python", "-m", "pip", "freeze")

# Run py.test against the unit tests.
Expand All @@ -108,6 +107,7 @@ def default(session, install_extras=True):
@nox.session(python=UNIT_TEST_PYTHON_VERSIONS)
def unit(session):
"""Run the unit test suite."""

default(session)


Expand All @@ -118,15 +118,19 @@ def unit_noextras(session):
# Install optional dependencies that are out-of-date.
# https://github.com/googleapis/python-bigquery/issues/933
# There is no pyarrow 1.0.0 package for Python 3.9.

if session.python == UNIT_TEST_PYTHON_VERSIONS[0]:
session.install("pyarrow==1.0.0")
session.install("pyarrow>=3.0.0")
elif session.python == UNIT_TEST_PYTHON_VERSIONS[-1]:
session.install("pyarrow")

default(session, install_extras=False)


@nox.session(python=DEFAULT_PYTHON_VERSION)
def mypy(session):
"""Run type checks with mypy."""

session.install("-e", ".[all]")
session.install(MYPY_VERSION)

Expand All @@ -147,6 +151,7 @@ def pytype(session):
# An indirect dependency attrs==21.1.0 breaks the check, and installing a less
# recent version avoids the error until a possibly better fix is found.
# https://github.com/googleapis/python-bigquery/issues/655

session.install("attrs==20.3.0")
session.install("-e", ".[all]")
session.install(PYTYPE_VERSION)
Expand Down Expand Up @@ -206,6 +211,7 @@ def system(session):
@nox.session(python=DEFAULT_PYTHON_VERSION)
def mypy_samples(session):
"""Run type checks with mypy."""

session.install("pytest")
for requirements_path in CURRENT_DIRECTORY.glob("samples/*/requirements.txt"):
session.install("-r", str(requirements_path))
Expand Down Expand Up @@ -283,6 +289,7 @@ def cover(session):
This outputs the coverage report aggregating coverage from the unit
test runs (not system test runs), and then erases coverage data.
"""

session.install("coverage", "pytest-cov")
session.run("coverage", "report", "--show-missing", "--fail-under=100")
session.run("coverage", "erase")
Expand Down
1 change: 1 addition & 0 deletions samples/desktopapp/requirements-test.txt
Expand Up @@ -2,3 +2,4 @@ google-cloud-testutils==1.4.0
pytest===7.4.4; python_version == '3.7'
pytest==8.1.1; python_version >= '3.8'
mock==5.1.0
pyarrow>=3.0.0
1 change: 1 addition & 0 deletions samples/snippets/requirements-test.txt
Expand Up @@ -2,3 +2,4 @@ google-cloud-testutils==1.4.0
pytest===7.4.4; python_version == '3.7'
pytest==8.1.1; python_version >= '3.8'
mock==5.1.0
pyarrow>=3.0.0
1 change: 1 addition & 0 deletions testing/constraints-3.11.txt
@@ -0,0 +1 @@
pyarrow>=3.0.0
1 change: 1 addition & 0 deletions testing/constraints-3.12.txt
@@ -0,0 +1 @@
pyarrow>=3.0.0
2 changes: 1 addition & 1 deletion testing/constraints-3.7.txt
Expand Up @@ -27,7 +27,7 @@ packaging==20.0.0
pandas==1.1.0
proto-plus==1.22.0
protobuf==3.19.5
pyarrow==3.0.0
pyarrow>=3.0.0
python-dateutil==2.7.3
requests==2.21.0
Shapely==1.8.4
Expand Down
19 changes: 17 additions & 2 deletions tests/unit/test__pandas_helpers.py
Expand Up @@ -53,6 +53,7 @@
if pyarrow:
import pyarrow.parquet
import pyarrow.types
from pyarrow import ArrowTypeError # type: ignore # noqa: E402
else: # pragma: NO COVER
# Mock out pyarrow when missing, because methods from pyarrow.types are
# used in test parameterization.
Expand Down Expand Up @@ -557,13 +558,27 @@ def test_bq_to_arrow_array_w_pandas_timestamp(module_under_test, bq_type, rows):
@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")
def test_bq_to_arrow_array_w_arrays(module_under_test):
rows = [[1, 2, 3], [], [4, 5, 6]]
series = pandas.Series(rows, dtype="object")
series = pandas.Series(rows, name="test_col", dtype="object")
bq_field = schema.SchemaField("field_name", "INTEGER", mode="REPEATED")
arrow_array = module_under_test.bq_to_arrow_array(series, bq_field)
roundtrip = arrow_array.to_pylist()
assert rows == roundtrip


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
def test_bq_to_arrow_array_w_conversion_fail(module_under_test):
chalmerlowe marked this conversation as resolved.
Show resolved Hide resolved
rows = [[1, 2, 3], [], [4, 5, 6]]
series = pandas.Series(rows, name="test_col", dtype="object")
bq_field = schema.SchemaField("field_name", "STRING", mode="REPEATED")
with pytest.raises(ArrowTypeError) as e:
module_under_test.bq_to_arrow_array(series, bq_field)
assert (
e.exconly()
== f"""pyarrow.lib.ArrowTypeError: Error converting Pandas column with name: "{series.name}" to pyarrow datatype: Array, ListArray, or StructArray"""
)


@pytest.mark.parametrize("bq_type", ["RECORD", "record", "STRUCT", "struct"])
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")
Expand All @@ -573,7 +588,7 @@ def test_bq_to_arrow_array_w_structs(module_under_test, bq_type):
None,
{"int_col": 456, "string_col": "def"},
]
series = pandas.Series(rows, dtype="object")
series = pandas.Series(rows, name="test_col", dtype="object")
bq_field = schema.SchemaField(
"field_name",
bq_type,
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/test_table.py
Expand Up @@ -49,7 +49,7 @@

pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()

if pyarrow:
if pyarrow: # pragma: NO COVER
import pyarrow.types

try:
Expand Down Expand Up @@ -3743,7 +3743,7 @@ def test_to_dataframe_w_dtypes_mapper(self):
if hasattr(pandas, "Float64Dtype"):
self.assertEqual(list(df.miles), [1.77, 6.66, 2.0])
self.assertEqual(df.miles.dtype.name, "Float64")
else:
else: # pragma: NO COVER
self.assertEqual(list(df.miles), ["1.77", "6.66", "2.0"])
self.assertEqual(df.miles.dtype.name, "string")

Expand Down