Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: make pyarrow an optional dependency post-3.20.0 yanked release #1879

Merged
merged 4 commits into from
Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 7 additions & 11 deletions google/cloud/bigquery/_pandas_helpers.py
Expand Up @@ -32,7 +32,7 @@
import pandas # type: ignore

pandas_import_exception = None
except ImportError as exc: # pragma: NO COVER
except ImportError as exc:
pandas = None
pandas_import_exception = exc
else:
Expand All @@ -44,25 +44,21 @@
date_dtype_name = db_dtypes.DateDtype.name
time_dtype_name = db_dtypes.TimeDtype.name
db_dtypes_import_exception = None
except ImportError as exc: # pragma: NO COVER
except ImportError as exc:
db_dtypes = None
db_dtypes_import_exception = exc
date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype

pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)
from pyarrow import ArrowTypeError # type: ignore # noqa: E402

_BIGNUMERIC_SUPPORT = False
if pyarrow is not None: # pragma: NO COVER
_BIGNUMERIC_SUPPORT = True
pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()

try:
    # _BaseGeometry is used to detect shapely objects in `bq_to_arrow_array`
from shapely.geometry.base import BaseGeometry as _BaseGeometry # type: ignore
except ImportError: # pragma: NO COVER
except ImportError:
# No shapely, use NoneType for _BaseGeometry as a placeholder.
_BaseGeometry = type(None)
else:
# We don't have any unit test sessions that install shapely but not pandas.
if pandas is not None: # pragma: NO COVER

def _to_wkb():
Expand Down Expand Up @@ -309,10 +305,10 @@ def bq_to_arrow_array(series, bq_field):
if field_type_upper in schema._STRUCT_TYPES:
return pyarrow.StructArray.from_pandas(series, type=arrow_type)
return pyarrow.Array.from_pandas(series, type=arrow_type)
except ArrowTypeError: # pragma: NO COVER
except pyarrow.ArrowTypeError:
msg = f"""Error converting Pandas column with name: "{series.name}" and datatype: "{series.dtype}" to an appropriate pyarrow datatype: Array, ListArray, or StructArray"""
_LOGGER.error(msg)
raise ArrowTypeError(msg)
raise pyarrow.ArrowTypeError(msg)


def get_column_or_index(dataframe, name):
Expand Down
4 changes: 2 additions & 2 deletions google/cloud/bigquery/_pyarrow_helpers.py
Expand Up @@ -20,7 +20,7 @@

try:
import pyarrow # type: ignore
except ImportError: # pragma: NO COVER
except ImportError:
pyarrow = None


Expand Down Expand Up @@ -49,7 +49,7 @@ def pyarrow_timestamp():
_BQ_TO_ARROW_SCALARS = {}
_ARROW_SCALAR_IDS_TO_BQ = {}

if pyarrow: # pragma: NO COVER
if pyarrow:
    # This dictionary is duplicated in bigquery_storage/test/unit/test_reader.py
# When modifying it be sure to update it there as well.
# Note(todo!!): type "BIGNUMERIC"'s matching pyarrow type is added in _pandas_helpers.py
Expand Down
13 changes: 8 additions & 5 deletions google/cloud/bigquery/_tqdm_helpers.py
Expand Up @@ -23,11 +23,14 @@

try:
import tqdm # type: ignore
import tqdm.notebook as notebook # type: ignore

except ImportError: # pragma: NO COVER
except ImportError:
tqdm = None

try:
import tqdm.notebook as tqdm_notebook # type: ignore
except ImportError:
tqdm_notebook = None

if typing.TYPE_CHECKING: # pragma: NO COVER
from google.cloud.bigquery import QueryJob
from google.cloud.bigquery.table import RowIterator
Expand All @@ -42,7 +45,7 @@

def get_progress_bar(progress_bar_type, description, total, unit):
"""Construct a tqdm progress bar object, if tqdm is installed."""
if tqdm is None:
if tqdm is None or tqdm_notebook is None and progress_bar_type == "tqdm_notebook":
if progress_bar_type is not None:
warnings.warn(_NO_TQDM_ERROR, UserWarning, stacklevel=3)
return None
Expand All @@ -58,7 +61,7 @@ def get_progress_bar(progress_bar_type, description, total, unit):
unit=unit,
)
elif progress_bar_type == "tqdm_notebook":
return notebook.tqdm(
return tqdm_notebook.tqdm(
bar_format="{l_bar}{bar}|",
desc=description,
file=sys.stdout,
Expand Down
4 changes: 2 additions & 2 deletions google/cloud/bigquery/_versions_helpers.py
Expand Up @@ -73,7 +73,7 @@ def try_import(self, raise_if_error: bool = False) -> Any:
"""
try:
import pyarrow
except ImportError as exc: # pragma: NO COVER
except ImportError as exc:
if raise_if_error:
raise exceptions.LegacyPyarrowError(
"pyarrow package not found. Install pyarrow version >="
Expand Down Expand Up @@ -212,7 +212,7 @@ def try_import(self, raise_if_error: bool = False) -> Any:
"""
try:
import pandas
except ImportError as exc: # pragma: NO COVER
except ImportError as exc:
if raise_if_error:
raise exceptions.LegacyPandasError(
"pandas package not found. Install pandas version >="
Expand Down
7 changes: 1 addition & 6 deletions google/cloud/bigquery/job/query.py
Expand Up @@ -56,14 +56,9 @@

try:
import pandas # type: ignore
except ImportError: # pragma: NO COVER
except ImportError:
pandas = None

try:
import db_dtypes # type: ignore
except ImportError: # pragma: NO COVER
db_dtypes = None

if typing.TYPE_CHECKING: # pragma: NO COVER
# Assumption: type checks are only used by library developers and CI environments
# that have all optional dependencies installed, thus no conditional imports.
Expand Down
2 changes: 1 addition & 1 deletion google/cloud/bigquery/magics/magics.py
Expand Up @@ -95,7 +95,7 @@
import IPython # type: ignore
from IPython import display # type: ignore
from IPython.core import magic_arguments # type: ignore
except ImportError: # pragma: NO COVER
except ImportError:
raise ImportError("This module can only be loaded in IPython.")

from google.api_core import client_info
Expand Down
6 changes: 3 additions & 3 deletions google/cloud/bigquery/table.py
Expand Up @@ -26,17 +26,17 @@

try:
import pandas # type: ignore
except ImportError: # pragma: NO COVER
except ImportError:
pandas = None

try:
import pyarrow # type: ignore
except ImportError: # pragma: NO COVER
except ImportError:
pyarrow = None

try:
import db_dtypes # type: ignore
except ImportError: # pragma: NO COVER
except ImportError:
db_dtypes = None

try:
Expand Down
15 changes: 8 additions & 7 deletions noxfile.py
Expand Up @@ -86,7 +86,7 @@ def default(session, install_extras=True):
install_target = ".[all]"
else:
install_target = "."
session.install("-e", install_target)
session.install("-e", install_target, "-c", constraints_path)
session.run("python", "-m", "pip", "freeze")

# Run py.test against the unit tests.
Expand Down Expand Up @@ -115,14 +115,15 @@ def unit(session):
def unit_noextras(session):
"""Run the unit test suite."""

# Install optional dependencies that are out-of-date.
# Install optional dependencies that are out-of-date to see that
# we fail gracefully.
# https://github.com/googleapis/python-bigquery/issues/933
# There is no pyarrow 1.0.0 package for Python 3.9.

#
# We only install this extra package on one of the two Python versions
# so that it continues to be an optional dependency.
# https://github.com/googleapis/python-bigquery/issues/1877
if session.python == UNIT_TEST_PYTHON_VERSIONS[0]:
session.install("pyarrow>=3.0.0")
elif session.python == UNIT_TEST_PYTHON_VERSIONS[-1]:
session.install("pyarrow")
session.install("pyarrow==1.0.0")

default(session, install_extras=False)

Expand Down
1 change: 0 additions & 1 deletion samples/desktopapp/requirements-test.txt
Expand Up @@ -2,4 +2,3 @@ google-cloud-testutils==1.4.0
pytest===7.4.4; python_version == '3.7'
pytest==8.1.1; python_version >= '3.8'
mock==5.1.0
pyarrow>=3.0.0
2 changes: 1 addition & 1 deletion samples/snippets/requirements-test.txt
@@ -1,5 +1,5 @@
# samples/snippets should be runnable with no "extras"
google-cloud-testutils==1.4.0
pytest===7.4.4; python_version == '3.7'
pytest==8.1.1; python_version >= '3.8'
mock==5.1.0
pyarrow>=3.0.0
3 changes: 2 additions & 1 deletion samples/snippets/requirements.txt
@@ -1 +1,2 @@
google-cloud-bigquery==3.19.0
# samples/snippets should be runnable with no "extras"
google-cloud-bigquery==3.19.0
1 change: 0 additions & 1 deletion testing/constraints-3.11.txt
@@ -1 +0,0 @@
pyarrow>=3.0.0
1 change: 0 additions & 1 deletion testing/constraints-3.12.txt
@@ -1 +0,0 @@
pyarrow>=3.0.0
4 changes: 2 additions & 2 deletions testing/constraints-3.7.txt
Expand Up @@ -27,9 +27,9 @@ packaging==20.0.0
pandas==1.1.0
proto-plus==1.22.0
protobuf==3.19.5
pyarrow>=3.0.0
pyarrow==3.0.0
python-dateutil==2.7.3
requests==2.21.0
Shapely==1.8.4
six==1.13.0
tqdm==4.7.4
tqdm==4.7.4
40 changes: 14 additions & 26 deletions tests/unit/job/test_query_pandas.py
Expand Up @@ -19,53 +19,38 @@

import pytest

from ..helpers import make_connection
from .helpers import _make_client
from .helpers import _make_job_resource

try:
from google.cloud import bigquery_storage
import google.cloud.bigquery_storage_v1.reader
import google.cloud.bigquery_storage_v1.services.big_query_read.client
except (ImportError, AttributeError): # pragma: NO COVER
except (ImportError, AttributeError):
bigquery_storage = None

try:
import pandas
except (ImportError, AttributeError): # pragma: NO COVER
pandas = None
try:
import shapely
except (ImportError, AttributeError): # pragma: NO COVER
except (ImportError, AttributeError):
shapely = None
try:
import geopandas
except (ImportError, AttributeError): # pragma: NO COVER
except (ImportError, AttributeError):
geopandas = None
try:
import tqdm
except (ImportError, AttributeError): # pragma: NO COVER
except (ImportError, AttributeError):
tqdm = None

try:
import importlib.metadata as metadata
except ImportError:
import importlib_metadata as metadata

from ..helpers import make_connection
from .helpers import _make_client
from .helpers import _make_job_resource

if pandas is not None:
PANDAS_INSTALLED_VERSION = metadata.version("pandas")
else:
PANDAS_INSTALLED_VERSION = "0.0.0"

pandas = pytest.importorskip("pandas")

try:
import pyarrow
import pyarrow.types
except ImportError: # pragma: NO COVER
except ImportError:
pyarrow = None

pandas = pytest.importorskip("pandas")


@pytest.fixture
def table_read_options_kwarg():
Expand Down Expand Up @@ -660,7 +645,10 @@ def test_to_dataframe_bqstorage_no_pyarrow_compression():
)


@pytest.mark.skipif(PANDAS_INSTALLED_VERSION[0:2] not in ["0.", "1."], reason="")
@pytest.mark.skipif(
pandas.__version__.startswith("2."),
    reason="pandas 2.0 changes some default dtypes and we haven't updated the test to account for those",
)
@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
def test_to_dataframe_column_dtypes():
from google.cloud.bigquery.job import QueryJob as target_class
Expand Down
15 changes: 8 additions & 7 deletions tests/unit/test__pandas_helpers.py
Expand Up @@ -30,12 +30,12 @@
import pandas
import pandas.api.types
import pandas.testing
except ImportError: # pragma: NO COVER
except ImportError:
pandas = None

try:
import geopandas
except ImportError: # pragma: NO COVER
except ImportError:
geopandas = None

import pytest
Expand All @@ -46,18 +46,19 @@
from google.cloud.bigquery import _pyarrow_helpers
from google.cloud.bigquery import _versions_helpers
from google.cloud.bigquery import schema
from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT

pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()

if pyarrow:
import pyarrow.parquet
import pyarrow.types
from pyarrow import ArrowTypeError # type: ignore # noqa: E402
else: # pragma: NO COVER

_BIGNUMERIC_SUPPORT = True
else:
# Mock out pyarrow when missing, because methods from pyarrow.types are
# used in test parameterization.
pyarrow = mock.Mock()
_BIGNUMERIC_SUPPORT = False

bigquery_storage = _versions_helpers.BQ_STORAGE_VERSIONS.try_import()

Expand Down Expand Up @@ -572,9 +573,9 @@ def test_bq_to_arrow_array_w_conversion_fail(module_under_test): # pragma: NO C
series = pandas.Series(rows, name="test_col", dtype="object")
bq_field = schema.SchemaField("field_name", "STRING", mode="REPEATED")
exc_msg = f"""Error converting Pandas column with name: "{series.name}" and datatype: "{series.dtype}" to an appropriate pyarrow datatype: Array, ListArray, or StructArray"""
with pytest.raises(ArrowTypeError, match=exc_msg):
with pytest.raises(pyarrow.ArrowTypeError, match=exc_msg):
module_under_test.bq_to_arrow_array(series, bq_field)
raise ArrowTypeError(exc_msg)
raise pyarrow.ArrowTypeError(exc_msg)


@pytest.mark.parametrize("bq_type", ["RECORD", "record", "STRUCT", "struct"])
Expand Down