Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit b5221f0

Browse files
authored
Re-implement dataframe boxing based on new structure (#861)
1 parent 3998b13 commit b5221f0

File tree

5 files changed: 100 additions and 124 deletions.

sdc/hiframes/boxing.py

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -202,15 +202,10 @@ def box_dataframe(typ, val, c):
202202
context = c.context
203203
builder = c.builder
204204

205-
n_cols = len(typ.columns)
206205
col_names = typ.columns
207206
arr_typs = typ.data
208-
dtypes = [a.dtype for a in arr_typs] # TODO: check Categorical
209207

210208
dataframe = cgutils.create_struct_proxy(typ)(context, builder, value=val)
211-
col_arrs = [builder.extract_value(dataframe.data, i) for i in range(n_cols)]
212-
# df unboxed from Python
213-
has_parent = cgutils.is_not_null(builder, dataframe.parent)
214209

215210
pyapi = c.pyapi
216211
# gil_state = pyapi.gil_ensure() # acquire GIL
@@ -219,28 +214,31 @@ def box_dataframe(typ, val, c):
219214
class_obj = pyapi.import_module_noblock(mod_name)
220215
df_dict = pyapi.dict_new()
221216

222-
for i, cname, arr, arr_typ, dtype in zip(range(n_cols), col_names, col_arrs, arr_typs, dtypes):
217+
arrays_list_objs = {}
218+
for cname, arr_typ in zip(col_names, arr_typs):
223219
# df['cname'] = boxed_arr
224220
# TODO: datetime.date, DatetimeIndex?
225221
name_str = context.insert_const_string(c.builder.module, cname)
226222
cname_obj = pyapi.string_from_string(name_str)
227223

228-
if dtype == string_type:
229-
arr_obj = box_str_arr(arr_typ, arr, c)
230-
elif isinstance(dtype, PDCategoricalDtype):
231-
arr_obj = box_categorical_array(arr_typ, arr, c)
232-
# context.nrt.incref(builder, arr_typ, arr)
233-
elif dtype == types.List(string_type):
234-
arr_obj = box_list(list_string_array_type, arr, c)
235-
# context.nrt.incref(builder, arr_typ, arr) # TODO required?
236-
# pyapi.print_object(arr_obj)
237-
else:
238-
arr_obj = box_array(arr_typ, arr, c)
239-
# TODO: is incref required?
240-
# context.nrt.incref(builder, arr_typ, arr)
224+
col_loc = typ.column_loc[cname]
225+
type_id, col_id = col_loc.type_id, col_loc.col_id
226+
227+
# dataframe.data looks like a tuple(list(array))
228+
# e.g. ([array(int64, 1d, C), array(int64, 1d, C)], [array(float64, 1d, C)])
229+
arrays_list_obj = arrays_list_objs.get(type_id)
230+
if arrays_list_obj is None:
231+
list_typ = types.List(arr_typ)
232+
# extracting list from the tuple
233+
list_val = builder.extract_value(dataframe.data, type_id)
234+
# boxing the whole list; individual arrays are taken from the boxed list below
235+
arrays_list_obj = box_list(list_typ, list_val, c)
236+
arrays_list_objs[type_id] = arrays_list_obj
237+
238+
# PyList_GetItem returns borrowed reference
239+
arr_obj = pyapi.list_getitem(arrays_list_obj, col_id)
241240
pyapi.dict_setitem(df_dict, cname_obj, arr_obj)
242241

243-
pyapi.decref(arr_obj)
244242
pyapi.decref(cname_obj)
245243

246244
df_obj = pyapi.call_method(class_obj, "DataFrame", (df_dict,))
@@ -252,6 +250,9 @@ def box_dataframe(typ, val, c):
252250
pyapi.object_setattr_string(df_obj, 'index', arr_obj)
253251
pyapi.decref(arr_obj)
254252

253+
for arrays_list_obj in arrays_list_objs.values():
254+
pyapi.decref(arrays_list_obj)
255+
255256
pyapi.decref(class_obj)
256257
# pyapi.gil_release(gil_state) # release GIL
257258
return df_obj

sdc/tests/test_dataframe.py

Lines changed: 43 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@ class TestDataFrame(TestCase):
7171

7272
# TODO: Data generator for DataFrames
7373

74-
@dfRefactoringNotImplemented
7574
def test_create1(self):
7675
def test_impl(A, B):
7776
df = pd.DataFrame({'A': A, 'B': B})
@@ -108,7 +107,6 @@ def test_impl():
108107

109108
self.assertEqual(hpat_func(), test_impl())
110109

111-
@dfRefactoringNotImplemented
112110
def test_create_with_series1(self):
113111
def test_impl(n):
114112
A = pd.Series(np.ones(n, dtype=np.int64))
@@ -132,7 +130,6 @@ def test_impl(A):
132130
self.assertEqual(hpat_func(df.A), test_impl(df.A))
133131

134132
@skip_sdc_jit
135-
@dfRefactoringNotImplemented
136133
def test_create_string_index(self):
137134
def test_impl(a):
138135
data = {'A': ['a', 'b'], 'B': [2, 3]}
@@ -142,7 +139,6 @@ def test_impl(a):
142139
hpat_func = sdc.jit(test_impl)
143140
pd.testing.assert_frame_equal(hpat_func(True), test_impl(True))
144141

145-
@dfRefactoringNotImplemented
146142
def test_create_cond1(self):
147143
def test_impl(A, B, c):
148144
if c:
@@ -232,7 +228,6 @@ def test_impl(n):
232228
do_check = False if platform.system() == 'Windows' and not IS_32BITS else True
233229
pd.testing.assert_frame_equal(hpat_func(n), test_impl(n), check_dtype=do_check)
234230

235-
@dfRefactoringNotImplemented
236231
def test_box2(self):
237232
def test_impl():
238233
df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'bb', 'ccc']})
@@ -978,7 +973,6 @@ def test_impl(df):
978973
with self.subTest(index=idx):
979974
pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
980975

981-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
982976
def test_df_isna_no_unboxing(self):
983977
def test_impl():
984978
df = pd.DataFrame({
@@ -1164,7 +1158,6 @@ def test_impl(df, n, k):
11641158
with self.subTest(index=idx, n=n, k=k):
11651159
pd.testing.assert_frame_equal(sdc_func(df, n, k), test_impl(df, n, k))
11661160

1167-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
11681161
def test_df_iloc_slice_no_unboxing(self):
11691162
def test_impl(n, k):
11701163
df = pd.DataFrame({
@@ -1280,7 +1273,6 @@ def test_impl(df, n):
12801273
with self.subTest(index=idx, n=n):
12811274
pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n))
12821275

1283-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
12841276
def test_df_iloc_list_no_unboxing(self):
12851277
def test_impl(n):
12861278
df = pd.DataFrame({
@@ -1310,7 +1302,6 @@ def test_impl(df, n):
13101302
with self.subTest(index=idx, n=n):
13111303
pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n))
13121304

1313-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
13141305
def test_df_iloc_list_bool_no_unboxing(self):
13151306
def test_impl(n):
13161307
df = pd.DataFrame({
@@ -1429,7 +1420,6 @@ def test_impl(df):
14291420
"C": [3.1, 8.4, 7.1, 3.2, 1]}, index=idx)
14301421
pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
14311422

1432-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
14331423
def test_df_loc_no_unboxing(self):
14341424
def test_impl():
14351425
df = pd.DataFrame({
@@ -1489,7 +1479,6 @@ def impl(a):
14891479
)
14901480
pd.testing.assert_frame_equal(sdc_func(df), ref_impl(df))
14911481

1492-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
14931482
def test_df_head_no_unboxing(self):
14941483
def test_impl(n):
14951484
df = pd.DataFrame({
@@ -1522,7 +1511,6 @@ def test_impl(df, deep):
15221511
with self.subTest(index=idx, deep=deep):
15231512
pd.testing.assert_frame_equal(sdc_func(df, deep), test_impl(df, deep))
15241513

1525-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
15261514
def test_df_copy_no_unboxing(self):
15271515
def test_impl(idx, deep):
15281516
df = pd.DataFrame({
@@ -1534,15 +1522,28 @@ def test_impl(idx, deep):
15341522
return df.copy(deep=deep)
15351523

15361524
sdc_impl = sdc.jit(test_impl)
1537-
indexes = [[3, 4, 2, 6, 1], ['a', 'b', 'c', 'd', 'e'], None]
1525+
indexes = [[3, 4, 2, 6, 1], ['a', 'b', 'c', 'd', 'e']]
15381526
cases_deep = [None, True, False]
15391527
for idx, deep in product(indexes, cases_deep):
15401528
with self.subTest(index=idx, deep=deep):
15411529
jit_result = sdc_impl(idx, deep)
15421530
ref_result = test_impl(idx, deep)
15431531
pd.testing.assert_frame_equal(jit_result, ref_result)
15441532

1545-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
1533+
@unittest.expectedFailure
1534+
def test_df_copy_no_unboxing_none_index_error(self):
1535+
def test_impl():
1536+
df = pd.DataFrame({
1537+
'A': [3.2, np.nan, 7.0, 3.3, np.nan],
1538+
'B': [3, 4, 1, 0, 222],
1539+
'C': [True, True, False, False, True],
1540+
'D': ['a', 'dd', 'c', '12', None]
1541+
}, index=None)
1542+
return df.copy(deep=True)
1543+
1544+
sdc_impl = sdc.jit(test_impl)
1545+
pd.testing.assert_frame_equal(sdc_impl(), test_impl())
1546+
15461547
def test_pct_change1(self):
15471548
def test_impl(n):
15481549
df = pd.DataFrame({'A': np.arange(n) + 1.0, 'B': np.arange(n) + 1})
@@ -1714,7 +1715,6 @@ def test_impl(df):
17141715
hpat_func = self.jit(test_impl)
17151716
pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))
17161717

1717-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
17181718
def test_df_reset_index_drop_literal_index_int_no_unboxing(self):
17191719
def gen_test_impl(drop):
17201720
def test_impl():
@@ -1745,7 +1745,6 @@ def test_impl(df):
17451745

17461746
pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))
17471747

1748-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
17491748
def test_df_reset_index_drop_default_index_int_no_unboxing(self):
17501749
def test_impl():
17511750
df = pd.DataFrame({
@@ -1842,20 +1841,17 @@ def test_impl(df):
18421841
index=index)
18431842
pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
18441843

1845-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
18461844
def test_df_drop_one_column(self):
18471845
def test_impl(index):
1848-
df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]},
1849-
index=index)
1846+
df = pd.DataFrame({
1847+
'A': [1.0, 2.0, np.nan, 1.0],
1848+
'B': [4, 5, 6, 7],
1849+
'C': [1.0, 2.0, np.nan, 1.0]
1850+
}, index=index)
18501851
return df.drop(columns='A')
18511852

1852-
index_to_test = [[1, 2, 3, 4],
1853-
[.1, .2, .3, .4],
1854-
['a', 'b', 'c', 'd']]
1855-
18561853
sdc_func = self.jit(test_impl)
1857-
1858-
for index in index_to_test:
1854+
for index in [[1, 2, 3, 4], [.1, .2, .3, .4], ['a', 'b', 'c', 'd']]:
18591855
with self.subTest(index=index):
18601856
pd.testing.assert_frame_equal(sdc_func(index), test_impl(index))
18611857

@@ -1884,7 +1880,6 @@ def test_impl(df):
18841880
index=index)
18851881
pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
18861882

1887-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
18881883
def test_df_drop_tuple_column(self):
18891884
def gen_test_impl(do_jit=False):
18901885
def test_impl(index):
@@ -2037,7 +2032,6 @@ def test_impl(df, arr):
20372032
sdc_func = self.jit(test_impl)
20382033
pd.testing.assert_frame_equal(sdc_func(df, arr), test_impl(df, arr))
20392034

2040-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
20412035
def test_df_getitem_bool_array_even_idx_no_unboxing(self):
20422036
def test_impl(arr):
20432037
df = pd.DataFrame({
@@ -2139,7 +2133,6 @@ def test_impl(idx):
21392133
sdc_func = self.jit(test_impl)
21402134
pd.testing.assert_series_equal(sdc_func('A'), test_impl('A'))
21412135

2142-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
21432136
def test_df_getitem_slice_idx_no_unboxing(self):
21442137
def test_impl():
21452138
df = pd.DataFrame({
@@ -2150,9 +2143,8 @@ def test_impl():
21502143
return df[1:3]
21512144

21522145
sdc_func = self.jit(test_impl)
2153-
pd.testing.assert_series_equal(sdc_func(), test_impl())
2146+
pd.testing.assert_frame_equal(sdc_func(), test_impl())
21542147

2155-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
21562148
def test_df_getitem_unbox_slice_idx_no_unboxing(self):
21572149
def test_impl(start, end):
21582150
df = pd.DataFrame({
@@ -2163,9 +2155,8 @@ def test_impl(start, end):
21632155
return df[start:end]
21642156

21652157
sdc_func = self.jit(test_impl)
2166-
pd.testing.assert_series_equal(sdc_func(1, 3), test_impl(1, 3))
2158+
pd.testing.assert_frame_equal(sdc_func(1, 3), test_impl(1, 3))
21672159

2168-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
21692160
def test_df_getitem_tuple_idx_no_unboxing(self):
21702161
def gen_test_impl(do_jit=False):
21712162
def test_impl():
@@ -2183,9 +2174,8 @@ def test_impl():
21832174

21842175
test_impl = gen_test_impl()
21852176
sdc_func = self.jit(gen_test_impl(do_jit=True))
2186-
pd.testing.assert_series_equal(sdc_func(), test_impl())
2177+
pd.testing.assert_frame_equal(sdc_func(), test_impl())
21872178

2188-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
21892179
def test_df_getitem_bool_series_idx_no_unboxing(self):
21902180
def test_impl():
21912181
df = pd.DataFrame({
@@ -2196,7 +2186,7 @@ def test_impl():
21962186
return df[df['A'] == -1.]
21972187

21982188
sdc_func = self.jit(test_impl)
2199-
pd.testing.assert_series_equal(sdc_func(), test_impl())
2189+
pd.testing.assert_frame_equal(sdc_func(), test_impl())
22002190

22012191
@skip_sdc_jit('DF.getitem unsupported Series name')
22022192
@dfRefactoringNotImplemented
@@ -2295,7 +2285,6 @@ def test_impl(df, df2):
22952285
df2.A[n // 2:] = n
22962286
pd.testing.assert_frame_equal(sdc_func(df, df2), test_impl(df, df2))
22972287

2298-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
22992288
def test_append_df_same_cols_no_index_no_unboxing(self):
23002289
def test_impl():
23012290
n = 11
@@ -2304,8 +2293,14 @@ def test_impl():
23042293
df2.A[n // 2:] = n
23052294
return df.append(df2, ignore_index=True)
23062295

2307-
sdc_func = self.jit(test_impl)
2308-
pd.testing.assert_frame_equal(sdc_func(), test_impl())
2296+
sdc_impl = self.jit(test_impl)
2297+
2298+
kwargs = {}
2299+
if platform.system() == 'Windows':
2300+
# The "dtype" attribute differs on Windows (int64 vs int32)
2301+
kwargs['check_dtype'] = False
2302+
2303+
pd.testing.assert_frame_equal(sdc_impl(), test_impl(), **kwargs)
23092304

23102305
@dfRefactoringNotImplemented # required re-implementing DataFrame unboxing
23112306
def test_append_df_same_cols_index_default(self):
@@ -2334,20 +2329,23 @@ def test_impl(df, df2):
23342329

23352330
pd.testing.assert_frame_equal(sdc_func(df, df2), test_impl(df, df2))
23362331

2337-
@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
23382332
def test_append_df_diff_cols_index_ignore_false_no_unboxing(self):
23392333
def test_impl():
23402334
n1 = 11
23412335
n2 = n1 * 2
2342-
df = pd.DataFrame({'A': np.arange(n1), 'B': np.arange(n1) ** 2},
2343-
index=np.arange(n1) ** 4)
2344-
df2 = pd.DataFrame({'C': np.arange(n2), 'D': np.arange(n2) ** 2,
2345-
'E S D': np.arange(n2) + 100},
2346-
index=np.arange(n2) ** 8)
2336+
df = pd.DataFrame({
2337+
'A': np.arange(n1), 'B': np.arange(n1) ** 2
2338+
}, index=np.arange(n1) ** 2)
2339+
df2 = pd.DataFrame({
2340+
'C': np.arange(n2), 'D': np.arange(n2) ** 2,
2341+
'E S D': np.arange(n2) + 100
2342+
}, index=np.arange(n2) ** 4)
23472343
return df.append(df2, ignore_index=False)
23482344

23492345
sdc_func = self.jit(test_impl)
2350-
pd.testing.assert_frame_equal(sdc_func(), test_impl())
2346+
res_jit = sdc_func()
2347+
res_ref = test_impl()
2348+
pd.testing.assert_frame_equal(res_jit, res_ref)
23512349

23522350
@dfRefactoringNotImplemented # required re-implementing DataFrame unboxing
23532351
def test_append_df_diff_cols_index_ignore_index(self):

0 commit comments

Comments
 (0)