
Commit 6d554ce

Merge pull request #1563 from Martozar/c.mze-cq-105
feat(gooddata-sdk): handle datetime granularity and support bytes limits in arrow fetch
2 parents 4cb8139 + add06e9 commit 6d554ce

7 files changed

Lines changed: 811 additions & 52 deletions


packages/gooddata-pandas/src/gooddata_pandas/arrow_convertor.py

Lines changed: 96 additions & 16 deletions
@@ -1,10 +1,12 @@
 # (C) 2026 GoodData Corporation
 from __future__ import annotations
 
+import logging
 from typing import Callable
 
 import orjson
 import pandas
+from gooddata_sdk.type_converter import AttributeConverterStore
 
 from gooddata_pandas.arrow_types import TypesMapper
 
@@ -42,6 +44,63 @@
 
 _REQUIRED_SCHEMA_KEYS = (_META_XTAB, _META_MODEL, _META_VIEW)
 
+logger = logging.getLogger(__name__)
+
+
+def read_model_labels(table: pa.Table) -> dict:
+    """Return the ``labels`` dict from the Arrow table's ``x-gdc-model-v1`` schema metadata.
+
+    Returns an empty dict when the metadata key is absent so callers can use it
+    unconditionally without extra None-checks.
+    """
+    if not table.schema.metadata or b"x-gdc-model-v1" not in table.schema.metadata:
+        return {}
+    return orjson.loads(table.schema.metadata[b"x-gdc-model-v1"]).get("labels", {})
+
+
+def _get_date_converter_for_label(label_id: str, model_labels: dict):
+    """Return a type Converter for date-granularity labels, or None for plain text attributes.
+
+    Reads the ``granularity`` field from Arrow model metadata (``x-gdc-model-v1``) and
+    looks up the matching converter in ``AttributeConverterStore``.
+
+    - ``DAY`` / ``MONTH`` / ``YEAR`` → ``DateConverter`` (→ ``pandas.Timestamp`` via external fn)
+    - ``WEEK`` / ``QUARTER`` → ``StringConverter`` (no-op)
+    - ``MINUTE`` / ``HOUR`` → ``DatetimeConverter``
+    - No granularity (text attrs) → ``None`` (caller skips conversion)
+    """
+    info = model_labels.get(label_id, {})
+    granularity = info.get("granularity")
+    if not granularity:
+        return None
+    return AttributeConverterStore.find_converter("DATE", granularity.upper())
+
+
+def convert_label_values(label_id: str, values: list, model_labels: dict) -> list:
+    """Apply date-granularity type conversion to a list of attribute values from an Arrow column.
+
+    Mirrors the non-Arrow execution path (``AttributeConverterStore`` in ``_typed_attribute_value``):
+
+    - ``DAY`` / ``MONTH`` / ``YEAR`` granularity → ``pandas.Timestamp``
+    - ``WEEK`` / ``QUARTER`` → ``str`` (unchanged)
+    - No granularity (text attributes) → values returned as the **same object**
+
+    ``None`` values are passed through unchanged.
+
+    Args:
+        label_id: Arrow column name / GoodData label local ID.
+        values: Raw values from ``table.column(label_id).to_pylist()``.
+        model_labels: The ``labels`` dict from ``x-gdc-model-v1`` schema metadata
+            (as returned by :func:`read_model_labels`).
+
+    Returns:
+        Converted list, or the original *values* object when no conversion is needed.
+    """
+    converter = _get_date_converter_for_label(label_id, model_labels)
+    if converter is None:
+        return values
+    return [converter.to_external_type(v) if v is not None else None for v in values]
+
 
 def build_metric_field_index(table: pa.Table) -> dict[int, str]:
     """Return {metric_dimension_index: arrow_field_name} from the table schema.
@@ -77,9 +136,14 @@ def _parse_schema_metadata(table: pa.Table) -> dict:
         raise ValueError(
             "Arrow table has no schema metadata. Expected GoodData metadata keys: " + ", ".join(_REQUIRED_SCHEMA_KEYS)
         )
-    schema_meta = {
-        k.decode(): orjson.loads(v) for k, v in table.schema.metadata.items() if k.decode() in _REQUIRED_SCHEMA_KEYS
-    }
+    schema_meta = {}
+    for _k, _v in table.schema.metadata.items():
+        try:
+            _k_str = _k.decode()
+        except UnicodeDecodeError:
+            continue
+        if _k_str in _REQUIRED_SCHEMA_KEYS:
+            schema_meta[_k_str] = orjson.loads(_v)
     missing = [k for k in _REQUIRED_SCHEMA_KEYS if k not in schema_meta]
     if missing:
         raise ValueError(
@@ -186,10 +250,15 @@ def _build_inline_index(
     totals_meta = xtab_meta.get("totalsMetadata", {})
     total_ref_vals: list = [None] * table.num_rows
     if totals_meta:
-        for field in table.schema:
-            if field.name.startswith(_COL_TOTAL_REF_PREFIX):
-                total_ref_vals = table.column(field.name).to_pylist()
-                break
+        total_ref_cols = [f.name for f in table.schema if f.name.startswith(_COL_TOTAL_REF_PREFIX)]
+        if total_ref_cols:
+            if len(total_ref_cols) > 1:
+                logger.warning(
+                    "Arrow table has %d __total_ref* columns; only %r is used for aggregation names.",
+                    len(total_ref_cols),
+                    total_ref_cols[0],
+                )
+            total_ref_vals = table.column(total_ref_cols[0]).to_pylist()
 
     # Precompute per-row aggregation name and kept-label set for total rows.
     agg_for_row: list[str | None] = [None] * table.num_rows
@@ -212,16 +281,17 @@ def _build_inline_index(
         values = table.column(lid).to_pylist()
         processed = []
         for i, v in enumerate(values):
-            if row_types[i] != 0 and isinstance(v, str):
-                if ref in kept_labels_for_row[i]:
-                    # Outer label kept as real attribute value in a subtotal row.
-                    processed.append(v)
-                elif v == "":
-                    # Aggregated level left empty by the server — fill with agg name.
-                    processed.append(agg_for_row[i] if agg_for_row[i] else v)
+            if row_types[i] != 0:
+                if isinstance(v, str):
+                    if ref in kept_labels_for_row[i]:
+                        processed.append(v)
+                    elif v == "":
+                        processed.append(agg_for_row[i] if agg_for_row[i] else v)
+                    else:
+                        processed.append(v.upper())
                 else:
-                    # Aggregation function marker (e.g. 'sum') — uppercase it.
-                    processed.append(v.upper())
+                    # Non-string value in a total row — replace with the aggregation name when available.
+                    processed.append(agg_for_row[i] if agg_for_row[i] is not None else v)
             else:
                 processed.append(v)
         arrays.append(processed)
@@ -410,6 +480,11 @@ def _label_ids_in_dim(dim: dict) -> set:
             (dim for dim in execution_dims if col_ref_label_ids <= _label_ids_in_dim(dim)),
             {},
         )
+        if not col_dim and execution_dims:
+            logger.warning(
+                "No execution dimension contains column label IDs %s; column_totals_indexes will be empty.",
+                col_ref_label_ids,
+            )
     else:
         col_dim = next(
             (dim for dim in execution_dims if any("measureGroupHeaders" in h for h in dim.get("headers", []))),
@@ -486,6 +561,11 @@ def _label_ids_in_dim(dim: dict) -> set:
             (dim for dim in execution_dims if ref_label_ids <= _label_ids_in_dim(dim)),
             {},
         )
+        if not row_dim and execution_dims:
+            logger.warning(
+                "No execution dimension contains row label IDs %s; row_totals_indexes will be empty.",
+                ref_label_ids,
+            )
     else:
         # Metrics-only: the dimension containing measureGroupHeaders is the output-row dim.
         row_dim = next(

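For orientation, a minimal sketch of how the two new helpers compose. The table contents, label IDs, and metadata payload below are illustrative assumptions; only the x-gdc-model-v1 "labels" shape, the helper signatures, and AttributeConverterStore.find_converter come from the diff above.

    import orjson
    import pyarrow as pa

    from gooddata_pandas.arrow_convertor import convert_label_values, read_model_labels

    # Hypothetical table: one DAY-granularity date label and one plain text attribute.
    model_meta = {"labels": {"date.day": {"granularity": "DAY"}, "region": {}}}
    table = pa.table({"date.day": ["2024-01-01", None], "region": ["EMEA", "NA"]})
    table = table.replace_schema_metadata({b"x-gdc-model-v1": orjson.dumps(model_meta)})

    labels = read_model_labels(table)  # empty dict when the metadata key is missing
    days = convert_label_values("date.day", table.column("date.day").to_pylist(), labels)
    # DAY granularity -> pandas.Timestamp values; None entries pass through unchanged
    regions = convert_label_values("region", table.column("region").to_pylist(), labels)
    # no granularity -> the original list object is returned as-is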
packages/gooddata-pandas/src/gooddata_pandas/arrow_types.py

Lines changed: 4 additions & 0 deletions
@@ -47,8 +47,12 @@ class ArrowConfig:
             custom_mapping is not provided.
         custom_mapping: Arrow type → pandas dtype mapping dict. Only used when
             types_mapper=TypesMapper.CUSTOM, ignored otherwise.
+        max_bytes: Optional byte-size limit for the Arrow response body. When set,
+            ``read_result_arrow`` raises ``ResultSizeBytesLimitExceeded`` if the
+            raw IPC payload exceeds this value before parsing begins.
     """
 
     self_destruct: bool = False
     types_mapper: TypesMapper = TypesMapper.DEFAULT
     custom_mapping: dict | None = field(default=None)
+    max_bytes: int | None = field(default=None)
packages/gooddata-pandas/src/gooddata_pandas/data_access.py

Lines changed: 26 additions & 9 deletions
@@ -18,11 +18,6 @@
 )
 from gooddata_sdk.utils import IdObjType
 
-try:
-    from gooddata_pandas.arrow_convertor import build_metric_field_index
-except ImportError:
-    pass  # Only needed when use_arrow=True; callers guard with _ARROW_AVAILABLE checks
-
 from gooddata_pandas.utils import (
     ColumnsDef,
     IndexDef,
@@ -34,6 +29,12 @@
     get_catalog_attributes_for_extract,
 )
 
+_ARROW_IMPORT_ERROR: ImportError | None = None
+try:
+    from gooddata_pandas.arrow_convertor import build_metric_field_index, convert_label_values, read_model_labels
+except ImportError as _e:  # pragma: no cover
+    _ARROW_IMPORT_ERROR = _e  # pragma: no cover
+
 
 class ExecutionDefinitionBuilder:
     _DEFAULT_INDEX_NAME: str = "0"
@@ -429,20 +430,30 @@ def _extract_from_arrow(
     col_to_attr_idx: dict[str, int],
     col_to_metric_idx: dict[str, int],
     index_to_attr_idx: dict[str, int],
+    max_bytes: int | None = None,
 ) -> tuple[dict, dict]:
     """
     Arrow-path extraction for indexed() / not_indexed().
 
     Reads the full result in one shot via the binary endpoint, then slices columns
-    by Arrow field name (metrics) or label id (attributes). No catalog fetch needed.
+    by Arrow field name (metrics) or label id (attributes).
+
+    Date-granularity attribute columns (year/month/day) are converted to
+    ``pandas.Timestamp`` to match the behaviour of the non-Arrow path.
+    Week and quarter values remain as strings (same as non-Arrow).
     """
-    table = execution.bare_exec_response.read_result_arrow()
+    if _ARROW_IMPORT_ERROR is not None:
+        raise ImportError(
+            "pyarrow is required for Arrow support. Install it with: pip install gooddata-pandas[arrow]"
+        ) from _ARROW_IMPORT_ERROR
+    table = execution.bare_exec_response.read_result_arrow(max_bytes=max_bytes)
     exec_def = execution.exec_def
 
     if table.num_rows == 0:
         return {col: [] for col in cols}, {idx: [] for idx in index_to_attr_idx}
 
     metric_dim_idx_to_field = build_metric_field_index(table)
+    model_labels = read_model_labels(table)
 
     data: dict[str, list] = {}
     for col in cols:
@@ -451,12 +462,14 @@ def _extract_from_arrow(
             data[col] = table.column(field_name).to_pylist()
         else:
             attr = exec_def.attributes[col_to_attr_idx[col]]
-            data[col] = table.column(attr.label.id).to_pylist()
+            label_id = attr.label.id
+            data[col] = convert_label_values(label_id, table.column(label_id).to_pylist(), model_labels)
 
     index: dict[str, list] = {}
     for idx_name, attr_idx in index_to_attr_idx.items():
         attr = exec_def.attributes[attr_idx]
-        index[idx_name] = table.column(attr.label.id).to_pylist()
+        label_id = attr.label.id
+        index[idx_name] = convert_label_values(label_id, table.column(label_id).to_pylist(), model_labels)
 
     return data, index
 
@@ -471,6 +484,7 @@ def compute_and_extract(
     is_cancellable: bool = False,
     result_page_len: int | None = None,
     use_arrow: bool = False,
+    max_bytes: int | None = None,
 ) -> tuple[dict, dict]:
     """
     Convenience function that computes and extracts data from the execution response.
@@ -489,6 +503,8 @@ def compute_and_extract(
             Defaults to 1000. Larger values can improve performance for large result sets.
         use_arrow (bool, optional): When True, fetches the result via the Arrow IPC binary
             endpoint in one shot instead of paginating through JSON. Requires pyarrow.
+        max_bytes (Optional[int]): Maximum response body size in bytes for the Arrow path.
+            Raises ResultSizeBytesLimitExceeded when exceeded. Ignored when use_arrow=False.
 
     Returns:
         tuple: A tuple containing the following dictionaries:
@@ -522,6 +538,7 @@ def compute_and_extract(
             col_to_attr_idx,
             col_to_metric_idx,
             index_to_attr_idx,
+            max_bytes=max_bytes,
         )
     elif not exec_def.has_attributes():
         return _extract_for_metrics_only(execution, cols, col_to_metric_idx), dict()

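A hedged caller-side sketch of the new knobs. The GoodDataSdk factory, workspace id, and column definitions are assumptions about the surrounding gooddata-pandas API; only use_arrow and max_bytes are introduced by this commit.

    from gooddata_sdk import GoodDataSdk

    from gooddata_pandas.data_access import compute_and_extract

    sdk = GoodDataSdk.create("https://example.gooddata.com", "<token>")  # assumed setup
    workspace_id = "demo_workspace"  # placeholder

    # Column definitions are placeholders; only use_arrow/max_bytes come from this commit.
    data, index = compute_and_extract(
        sdk,
        workspace_id,
        columns={"revenue": "metric/revenue", "day": "label/date.day"},
        use_arrow=True,              # single-shot Arrow IPC fetch instead of paged JSON
        max_bytes=50 * 1024 * 1024,  # abort with ResultSizeBytesLimitExceeded past ~50 MiB
    )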
packages/gooddata-pandas/src/gooddata_pandas/dataframe.py

Lines changed: 4 additions & 2 deletions
@@ -159,6 +159,7 @@ def indexed(
             is_cancellable=is_cancellable,
             result_page_len=result_page_len,
             use_arrow=use_arrow,
+            max_bytes=self._arrow_config.max_bytes if use_arrow else None,
         )
 
         _idx = make_pandas_index(index)
@@ -210,6 +211,7 @@ def not_indexed(
             is_cancellable=is_cancellable,
             result_page_len=result_page_len,
             use_arrow=use_arrow,
+            max_bytes=self._arrow_config.max_bytes if use_arrow else None,
         )
 
         return pandas.DataFrame(data=data)
@@ -539,7 +541,7 @@ def for_exec_def_arrow(
             on_execution_submitted(execution)
 
         exec_response = execution.bare_exec_response
-        table = exec_response.read_result_arrow()
+        table = exec_response.read_result_arrow(max_bytes=self._arrow_config.max_bytes)
         return self._table_to_df_and_metadata(table, exec_response, label_overrides, grand_totals_position)
 
     def for_arrow_table(
@@ -684,7 +686,7 @@ def for_exec_result_id(
                     result_cache_metadata.execution_response, _check_type=False
                 ),
             )
-            table = exec_response.read_result_arrow()
+            table = exec_response.read_result_arrow(max_bytes=self._arrow_config.max_bytes)
             return self._table_to_df_and_metadata(table, exec_response, label_overrides, grand_totals_position)
 
         return convert_execution_response_to_dataframe(