openml · amueller · Dec 3, 2019 · Dec 3, 2019 · Dec 3, 2019 · Dec 3, 2019
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -388,6 +388,18 @@ def _parse_data_from_arff(
                                                     'boolean'):
                     col.append(self._unpack_categories(
                         X[column_name], categories_names[column_name]))
+                elif attribute_dtype[column_name] in ('floating',
+                                                      'integer'):
+                    X_col = X[column_name]
+                    if X_col.min() >= 0 and X_col.max() <= 255:
+                        try:
+                            X_col_uint = X_col.astype('uint8')
+                            if (X_col == X_col_uint).all():
+                                col.append(X_col_uint)
+                                continue
+                        except ValueError:
+                            pass
+                    col.append(X[column_name])
                 else:
                     col.append(X[column_name])
             X = pd.concat(col, axis=1)

diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
@@ -122,21 +122,29 @@ def test_get_data_no_str_data_for_nparrays(self):
         with pytest.raises(PyOpenMLError, match=err_msg):
             self.titanic.get_data(dataset_format='array')
 
+    def _check_expected_type(self, dtype, is_cat, col):
+        # Expected dtype name: 'category' for categoricals, 'uint8' when the
+        # column is NaN-free and unchanged by a uint8 cast, else 'float64'.
+        expected = 'float64'
+        if is_cat:
+            expected = 'category'
+        elif col.notna().all() and (col == col.astype('uint8')).all():
+            expected = 'uint8'
+        self.assertEqual(dtype.name, expected)
+
     def test_get_data_with_rowid(self):
         self.dataset.row_id_attribute = "condition"
         rval, _, categorical, _ = self.dataset.get_data(include_row_id=True)
         self.assertIsInstance(rval, pd.DataFrame)
-        for (dtype, is_cat) in zip(rval.dtypes, categorical):
-            expected_type = 'category' if is_cat else 'float64'
-            self.assertEqual(dtype.name, expected_type)
+        for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval):  # col: column label
+            self._check_expected_type(dtype, is_cat, rval[col])  # helper inspects the values
         self.assertEqual(rval.shape, (898, 39))
         self.assertEqual(len(categorical), 39)
 
         rval, _, categorical, _ = self.dataset.get_data()
         self.assertIsInstance(rval, pd.DataFrame)
-        for (dtype, is_cat) in zip(rval.dtypes, categorical):
-            expected_type = 'category' if is_cat else 'float64'
-            self.assertEqual(dtype.name, expected_type)
+        for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval):  # col: column label
+            self._check_expected_type(dtype, is_cat, rval[col])  # helper inspects the values
         self.assertEqual(rval.shape, (898, 38))
         self.assertEqual(len(categorical), 38)
 
@@ -153,9 +161,8 @@ def test_get_data_with_target_array(self):
     def test_get_data_with_target_pandas(self):
         X, y, categorical, attribute_names = self.dataset.get_data(target="class")
         self.assertIsInstance(X, pd.DataFrame)
-        for (dtype, is_cat) in zip(X.dtypes, categorical):
-            expected_type = 'category' if is_cat else 'float64'
-            self.assertEqual(dtype.name, expected_type)
+        for (dtype, is_cat, col) in zip(X.dtypes, categorical, X):  # col: column label
+            self._check_expected_type(dtype, is_cat, X[col])  # helper inspects the values
         self.assertIsInstance(y, pd.Series)
         self.assertEqual(y.dtype.name, 'category')
 
@@ -178,16 +185,14 @@ def test_get_data_rowid_and_ignore_and_target(self):
     def test_get_data_with_ignore_attributes(self):
         self.dataset.ignore_attribute = ["condition"]
         rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=True)
-        for (dtype, is_cat) in zip(rval.dtypes, categorical):
-            expected_type = 'category' if is_cat else 'float64'
-            self.assertEqual(dtype.name, expected_type)
+        for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval):  # col: column label
+            self._check_expected_type(dtype, is_cat, rval[col])  # helper inspects the values
         self.assertEqual(rval.shape, (898, 39))
         self.assertEqual(len(categorical), 39)
 
         rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=False)
-        for (dtype, is_cat) in zip(rval.dtypes, categorical):
-            expected_type = 'category' if is_cat else 'float64'
-            self.assertEqual(dtype.name, expected_type)
+        for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval):  # col: column label
+            self._check_expected_type(dtype, is_cat, rval[col])  # pass the column, not its label
         self.assertEqual(rval.shape, (898, 38))
         self.assertEqual(len(categorical), 38)
 

diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -352,6 +352,13 @@ def test_get_dataset(self):
         openml.config.server = self.production_server
         self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45)
 
+    def test_get_dataset_uint8_dtype(self):
+        anneal = openml.datasets.get_dataset(1)  # asserted below to be 'anneal'
+        self.assertEqual(type(anneal), OpenMLDataset)
+        self.assertEqual(anneal.name, 'anneal')
+        frame, _, _, _ = anneal.get_data()
+        self.assertEqual(frame['carbon'].dtype, 'uint8')
+
     def test_get_dataset_lazy(self):
         dataset = openml.datasets.get_dataset(1, download_data=False)
         self.assertEqual(type(dataset), OpenMLDataset)