Skip to content

Commit 4d81813

Browse files
committed
feat(gooddata-pandas): adopt AttributeConverterStore in arrow DF loading
risk: low
1 parent 4cb8139 commit 4d81813

4 files changed

Lines changed: 521 additions & 19 deletions

File tree

packages/gooddata-pandas/src/gooddata_pandas/arrow_convertor.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import orjson
77
import pandas
8+
from gooddata_sdk.type_converter import AttributeConverterStore
89

910
from gooddata_pandas.arrow_types import TypesMapper
1011

@@ -43,6 +44,61 @@
4344
_REQUIRED_SCHEMA_KEYS = (_META_XTAB, _META_MODEL, _META_VIEW)
4445

4546

47+
def read_model_labels(table: pa.Table) -> dict:
    """Extract the ``labels`` mapping from the table's ``x-gdc-model-v1`` schema metadata.

    The metadata value stored under ``b"x-gdc-model-v1"`` is parsed as JSON and
    its ``labels`` entry is returned.

    An empty dict is returned when the schema carries no metadata, when the
    ``x-gdc-model-v1`` key is missing, or when the parsed document has no
    ``labels`` entry, so callers can use the result without None-checks.
    """
    schema_metadata = table.schema.metadata
    # Covers both "no metadata at all" (None) and an empty metadata mapping.
    if not schema_metadata:
        return {}
    try:
        raw_model = schema_metadata[b"x-gdc-model-v1"]
    except KeyError:
        return {}
    model = orjson.loads(raw_model)
    return model.get("labels", {})
56+
57+
58+
def _get_date_converter_for_label(label_id: str, model_labels: dict):
59+
"""Return a type Converter for date-granularity labels, or None for plain text attributes.
60+
61+
Reads the ``granularity`` field from Arrow model metadata (``x-gdc-model-v1``) and
62+
looks up the matching converter in ``AttributeConverterStore``.
63+
64+
- ``DAY`` / ``MONTH`` / ``YEAR`` → ``DateConverter`` (→ ``pandas.Timestamp`` via external fn)
65+
- ``WEEK`` / ``QUARTER`` → ``StringConverter`` (no-op)
66+
- ``MINUTE`` / ``HOUR`` → ``DatetimeConverter``
67+
- No granularity (text attrs) → ``None`` (caller skips conversion)
68+
"""
69+
info = model_labels.get(label_id, {})
70+
granularity = info.get("granularity")
71+
if not granularity:
72+
return None
73+
return AttributeConverterStore.find_converter("DATE", granularity.upper())
74+
75+
76+
def convert_label_values(label_id: str, values: list, model_labels: dict) -> list:
    """Convert raw Arrow attribute values according to the label's date granularity.

    This mirrors the non-Arrow execution path (``AttributeConverterStore`` in
    ``_typed_attribute_value``):

    - ``DAY`` / ``MONTH`` / ``YEAR`` granularity → ``pandas.Timestamp``
    - ``WEEK`` / ``QUARTER``                     → ``str`` (unchanged)
    - No granularity (text attributes)           → *values* is returned as the **same object**

    ``None`` entries are never handed to the converter; they stay ``None``.

    Args:
        label_id: Arrow column name / GoodData label local ID.
        values: Raw values from ``table.column(label_id).to_pylist()``.
        model_labels: The ``labels`` dict from ``x-gdc-model-v1`` schema metadata
            (as returned by :func:`read_model_labels`).

    Returns:
        A new list of converted values, or the original *values* object when
        the label has no converter.
    """
    converter = _get_date_converter_for_label(label_id, model_labels)
    if converter is None:
        # Not a date label: hand back the input list itself, no copy.
        return values

    def _convert_one(raw):
        if raw is None:
            return None
        return converter.to_external_type(raw)

    return [_convert_one(raw) for raw in values]
100+
101+
46102
def build_metric_field_index(table: pa.Table) -> dict[int, str]:
47103
"""Return {metric_dimension_index: arrow_field_name} from the table schema.
48104

packages/gooddata-pandas/src/gooddata_pandas/data_access.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from gooddata_sdk.utils import IdObjType
2020

2121
try:
22-
from gooddata_pandas.arrow_convertor import build_metric_field_index
22+
from gooddata_pandas.arrow_convertor import build_metric_field_index, convert_label_values, read_model_labels
2323
except ImportError:
2424
pass # Only needed when use_arrow=True; callers guard with _ARROW_AVAILABLE checks
2525

@@ -434,7 +434,11 @@ def _extract_from_arrow(
434434
Arrow-path extraction for indexed() / not_indexed().
435435
436436
Reads the full result in one shot via the binary endpoint, then slices columns
437-
by Arrow field name (metrics) or label id (attributes). No catalog fetch needed.
437+
by Arrow field name (metrics) or label id (attributes).
438+
439+
Date-granularity attribute columns (year/month/day) are converted to
440+
``pandas.Timestamp`` to match the behaviour of the non-Arrow path.
441+
Week and quarter values remain as strings (same as non-Arrow).
438442
"""
439443
table = execution.bare_exec_response.read_result_arrow()
440444
exec_def = execution.exec_def
@@ -443,6 +447,7 @@ def _extract_from_arrow(
443447
return {col: [] for col in cols}, {idx: [] for idx in index_to_attr_idx}
444448

445449
metric_dim_idx_to_field = build_metric_field_index(table)
450+
model_labels = read_model_labels(table)
446451

447452
data: dict[str, list] = {}
448453
for col in cols:
@@ -451,12 +456,14 @@ def _extract_from_arrow(
451456
data[col] = table.column(field_name).to_pylist()
452457
else:
453458
attr = exec_def.attributes[col_to_attr_idx[col]]
454-
data[col] = table.column(attr.label.id).to_pylist()
459+
label_id = attr.label.id
460+
data[col] = convert_label_values(label_id, table.column(label_id).to_pylist(), model_labels)
455461

456462
index: dict[str, list] = {}
457463
for idx_name, attr_idx in index_to_attr_idx.items():
458464
attr = exec_def.attributes[attr_idx]
459-
index[idx_name] = table.column(attr.label.id).to_pylist()
465+
label_id = attr.label.id
466+
index[idx_name] = convert_label_values(label_id, table.column(label_id).to_pylist(), model_labels)
460467

461468
return data, index
462469

packages/gooddata-pandas/tests/dataframe/fixtures/arrow/manifest.json

Lines changed: 85 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[
22
{
33
"name": "flat_attrs_and_metrics",
4-
"description": "Single dim with attributes and measureGroup \u2014 flat table",
4+
"description": "Single dim with attributes and measureGroup \u2014 flat table",
55
"shape": [
66
96,
77
1
@@ -11,7 +11,7 @@
1111
},
1212
{
1313
"name": "two_dim_metrics_in_rows",
14-
"description": "measureGroup in dim0 \u2192 each metric is a row; attributes fan out as columns",
14+
"description": "measureGroup in dim0 \u2192 each metric is a row; attributes fan out as columns",
1515
"shape": [
1616
2,
1717
17
@@ -21,7 +21,7 @@
2121
},
2222
{
2323
"name": "two_dim_metrics_in_cols",
24-
"description": "Attributes in dim0 (rows), measureGroup in dim1 (columns) \u2014 most common layout",
24+
"description": "Attributes in dim0 (rows), measureGroup in dim1 (columns) \u2014 most common layout",
2525
"shape": [
2626
48,
2727
8
@@ -31,7 +31,7 @@
3131
},
3232
{
3333
"name": "wide_few_rows_many_cols",
34-
"description": "Single attribute in dim0, multiple attributes + measureGroup in dim1 \u2192 wide DataFrame",
34+
"description": "Single attribute in dim0, multiple attributes + measureGroup in dim1 \u2192 wide DataFrame",
3535
"shape": [
3636
4,
3737
96
@@ -41,7 +41,7 @@
4141
},
4242
{
4343
"name": "metrics_only",
44-
"description": "No attributes \u2014 single row of aggregate metric values",
44+
"description": "No attributes \u2014 single row of aggregate metric values",
4545
"shape": [
4646
2,
4747
1
@@ -51,7 +51,7 @@
5151
},
5252
{
5353
"name": "single_metric_many_rows",
54-
"description": "Three attributes + one metric \u2014 tall narrow table",
54+
"description": "Three attributes + one metric \u2014 tall narrow table",
5555
"shape": [
5656
182,
5757
1
@@ -81,7 +81,7 @@
8181
},
8282
{
8383
"name": "totals_grand_row_two_col_labels",
84-
"description": "Grand total rows; column dim has two labels \u2014 tests index padding",
84+
"description": "Grand total rows; column dim has two labels \u2014 tests index padding",
8585
"shape": [
8686
96,
8787
18
@@ -91,7 +91,7 @@
9191
},
9292
{
9393
"name": "totals_grand_col_two_row_labels",
94-
"description": "Grand total columns; row dim has two labels \u2014 tests column index padding",
94+
"description": "Grand total columns; row dim has two labels \u2014 tests column index padding",
9595
"shape": [
9696
18,
9797
96
@@ -131,7 +131,7 @@
131131
},
132132
{
133133
"name": "dim_r_m",
134-
"description": "dim0=[region], dim1=[measureGroup] \u2014 1 row attr, metrics in cols",
134+
"description": "dim0=[region], dim1=[measureGroup] \u2014 1 row attr, metrics in cols",
135135
"shape": [
136136
5,
137137
2
@@ -161,7 +161,7 @@
161161
},
162162
{
163163
"name": "dim_m_c",
164-
"description": "dim0=[measureGroup], dim1=[products.category] \u2014 metrics in rows, 1 col attr",
164+
"description": "dim0=[measureGroup], dim1=[products.category] \u2014 metrics in rows, 1 col attr",
165165
"shape": [
166166
2,
167167
4
@@ -251,7 +251,7 @@
251251
},
252252
{
253253
"name": "tot_d0_sub",
254-
"description": "Base A: subtotal per region (rolls up category) \u2192 extra rows in dim0",
254+
"description": "Base A: subtotal per region (rolls up category) \u2192 extra rows in dim0",
255255
"shape": [
256256
44,
257257
4
@@ -261,7 +261,7 @@
261261
},
262262
{
263263
"name": "tot_d0_grand",
264-
"description": "Base A: grand total of dim0 (all items) \u2192 extra column in dim1",
264+
"description": "Base A: grand total of dim0 (all items) \u2192 extra column in dim1",
265265
"shape": [
266266
34,
267267
4
@@ -281,7 +281,7 @@
281281
},
282282
{
283283
"name": "tot_d1_sub",
284-
"description": "Base B: subtotal per order_status (rolls up date.year) \u2192 extra columns in dim1",
284+
"description": "Base B: subtotal per order_status (rolls up date.year) \u2192 extra columns in dim1",
285285
"shape": [
286286
18,
287287
36
@@ -291,7 +291,7 @@
291291
},
292292
{
293293
"name": "tot_d1_grand",
294-
"description": "Base B: grand total of dim1 (all items) \u2192 extra row in dim0",
294+
"description": "Base B: grand total of dim1 (all items) \u2192 extra row in dim0",
295295
"shape": [
296296
18,
297297
30
@@ -321,7 +321,7 @@
321321
},
322322
{
323323
"name": "tot_d0sub_d1grand",
324-
"description": "Base C: row subtotals per region + grand total of dim1 \u2192 extra col rows + SUM row",
324+
"description": "Base C: row subtotals per region + grand total of dim1 \u2192 extra col rows + SUM row",
325325
"shape": [
326326
44,
327327
16
@@ -358,5 +358,75 @@
358358
],
359359
"result_id": "b592a3f77e55d5e3d4450891c0d5c1fbedcac173:f9ed5a4e4b8cf21716cc409cd87664ef7da3444d4f07c4864989916d420bf85f",
360360
"dir": "totals_both_dims"
361+
},
362+
{
363+
"name": "date_year_in_rows",
364+
"description": "Date attribute (YEAR granularity) in row dimension — Arrow path date→Timestamp parity",
365+
"shape": [
366+
3,
367+
1
368+
],
369+
"result_id": "0000000000000000000000000000000000000000:date_year_in_rows",
370+
"dir": "date_year_in_rows"
371+
},
372+
{
373+
"name": "date_month_in_rows",
374+
"description": "Date attribute (MONTH granularity) in row dimension — Arrow path date→Timestamp parity",
375+
"shape": [
376+
3,
377+
1
378+
],
379+
"result_id": "0000000000000000000000000000000000000000:date_month_in_rows",
380+
"dir": "date_month_in_rows"
381+
},
382+
{
383+
"name": "date_day_in_rows",
384+
"description": "Date attribute (DAY granularity) in row dimension — Arrow path date→Timestamp parity",
385+
"shape": [
386+
2,
387+
1
388+
],
389+
"result_id": "0000000000000000000000000000000000000000:date_day_in_rows",
390+
"dir": "date_day_in_rows"
391+
},
392+
{
393+
"name": "date_week_in_rows",
394+
"description": "Date attribute (WEEK granularity) in row dimension — stays string, no Timestamp conversion",
395+
"shape": [
396+
2,
397+
1
398+
],
399+
"result_id": "0000000000000000000000000000000000000000:date_week_in_rows",
400+
"dir": "date_week_in_rows"
401+
},
402+
{
403+
"name": "date_quarter_in_rows",
404+
"description": "Date attribute (QUARTER granularity) in row dimension — stays string, no Timestamp conversion",
405+
"shape": [
406+
2,
407+
1
408+
],
409+
"result_id": "0000000000000000000000000000000000000000:date_quarter_in_rows",
410+
"dir": "date_quarter_in_rows"
411+
},
412+
{
413+
"name": "empty_two_dim_attrs_metrics",
414+
"description": "Standard two-dim layout (attr rows, metric cols), 0 data rows",
415+
"shape": [
416+
0,
417+
2
418+
],
419+
"result_id": "0000000000000000000000000000000000000000:empty_two_dim_attrs_metrics",
420+
"dir": "empty_two_dim_attrs_metrics"
421+
},
422+
{
423+
"name": "empty_flat_attrs_metrics",
424+
"description": "Single-dim (flat) layout, attr and metrics co-dim, 0 data rows",
425+
"shape": [
426+
0,
427+
2
428+
],
429+
"result_id": "0000000000000000000000000000000000000000:empty_flat_attrs_metrics",
430+
"dir": "empty_flat_attrs_metrics"
361431
}
362432
]

0 commit comments

Comments
 (0)