openml · janvanrijn · Jan 9, 2024 · Jan 9, 2024 · Jan 10, 2024 · Jan 10, 2024
diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py
@@ -23,6 +23,10 @@ class OpenMLDataFeature:
         list of the possible values, in case of nominal attribute
     number_missing_values : int
         Number of rows that have a missing value for this feature.
+    ontologies : list(str), optional
+        list of ontologies attached to this feature. An ontology describes the
+        concepts that are described in a feature. An ontology is defined by a
+        URL where the information is provided.
     """
 
     LEGAL_DATA_TYPES: ClassVar[Sequence[str]] = ["nominal", "numeric", "string", "date"]
@@ -34,6 +38,7 @@ def __init__(  # noqa: PLR0913
         data_type: str,
         nominal_values: list[str],
         number_missing_values: int,
+        ontologies: list[str] | None = None,
     ):
         if not isinstance(index, int):
             raise TypeError(f"Index must be `int` but is {type(index)}")
@@ -65,6 +70,7 @@ def __init__(  # noqa: PLR0913
         self.index = index
         self.name = str(name)
         self.data_type = str(data_type)
+        self.ontologies = ontologies
         self.nominal_values = nominal_values
         self.number_missing_values = number_missing_values
 

diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -1069,6 +1069,7 @@ def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature
             xmlfeature["oml:data_type"],
             xmlfeature.get("oml:nominal_value"),
             int(nr_missing),
+            xmlfeature.get("oml:ontology"),
         )
         if idx != feature.index:
             raise ValueError("Data features not provided in right order")

diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -1061,6 +1061,63 @@ def fork_dataset(data_id: int) -> int:
     return int(data_id)
 
 
+def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool:
+    """
+    An ontology describes the concepts that are described in a feature. An
+    ontology is defined by a URL where the information is provided. Adds
+    an ontology (URL) to a given dataset feature (defined by a dataset id
+    and index). The dataset has to exist on OpenML and needs to have been
+    processed by the evaluation engine.
+
+    Parameters
+    ----------
+    data_id : int
+        id of the dataset to which the feature belongs
+
+    index : int
+        index of the feature in dataset (0-based)
+
+    ontology : str
+        URL to ontology (max. 256 characters)
+
+    Returns
+    -------
+    True if the request succeeded; otherwise an OpenML server exception is raised
+    """
+    upload_data = {"data_id": data_id, "index": index, "ontology": ontology}
+    openml._api_calls._perform_api_call("data/feature/ontology/add", "post", data=upload_data)
+    # an error will be thrown in case the request was unsuccessful
+    return True
+
+
+def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> bool:
+    """
+    Removes an existing ontology (URL) from a given dataset feature (defined
+    by a dataset id and index). The dataset has to exist on OpenML and needs
+    to have been processed by the evaluation engine. The ontology needs to be
+    attached to the specific feature.
+
+    Parameters
+    ----------
+    data_id : int
+        id of the dataset to which the feature belongs
+
+    index : int
+        index of the feature in dataset (0-based)
+
+    ontology : str
+        URL to ontology (max. 256 characters)
+
+    Returns
+    -------
+    True if the request succeeded; otherwise an OpenML server exception is raised
+    """
+    upload_data = {"data_id": data_id, "index": index, "ontology": ontology}
+    openml._api_calls._perform_api_call("data/feature/ontology/remove", "post", data=upload_data)
+    # an error will be thrown in case the request was unsuccessful
+    return True
+
+
 def _topic_add_dataset(data_id: int, topic: str) -> int:
     """
     Adds a topic for a dataset.

diff --git a/tests/files/org/openml/test/datasets/1/dataset.arff b/tests/files/org/openml/test/datasets/1/dataset.arff
diff --git a/tests/files/org/openml/test/datasets/1/description.xml b/tests/files/org/openml/test/datasets/1/description.xml
@@ -0,0 +1,130 @@
+<oml:data_set_description xmlns:oml="http://openml.org/openml">
+  <oml:id>1</oml:id>
+  <oml:name>anneal</oml:name>
+  <oml:version>1</oml:version>
+  <oml:description>**Author**:   
+**Source**: Unknown -   
+**Please cite**:   
+
+1. Title of Database: Annealing Data
+
+ 2. Source Information: donated by David Sterling and Wray Buntine.
+
+ 3. Past Usage: unknown
+
+ 4. Relevant Information:
+    -- Explanation: I suspect this was left by Ross Quinlan in 1987 at the
+       4th Machine Learning Workshop.  I'd have to check with Jeff Schlimmer
+       to double check this.
+
+ 5. Number of Instances: 798
+
+ 6. Number of Attributes: 38
+    -- 6 continuously-valued
+    -- 3 integer-valued
+    -- 29 nominal-valued
+
+ 7. Attribute Information:
+     1. family:          --,GB,GK,GS,TN,ZA,ZF,ZH,ZM,ZS
+     2. product-type:    C, H, G
+     3. steel:           -,R,A,U,K,M,S,W,V
+     4. carbon:          continuous
+     5. hardness:        continuous
+     6. temper_rolling:  -,T
+     7. condition:       -,S,A,X
+     8. formability:     -,1,2,3,4,5
+     9. strength:        continuous
+    10. non-ageing:      -,N
+    11. surface-finish:  P,M,-
+    12. surface-quality: -,D,E,F,G
+    13. enamelability:   -,1,2,3,4,5
+    14. bc:              Y,-
+    15. bf:              Y,-
+    16. bt:              Y,-
+    17. bw/me:           B,M,-
+    18. bl:              Y,-
+    19. m:               Y,-
+    20. chrom:           C,-
+    21. phos:            P,-
+    22. cbond:           Y,-
+    23. marvi:           Y,-
+    24. exptl:           Y,-
+    25. ferro:           Y,-
+    26. corr:            Y,-
+    27. blue/bright/varn/clean:          B,R,V,C,-
+    28. lustre:          Y,-
+    29. jurofm:          Y,-
+    30. s:               Y,-
+    31. p:               Y,-
+    32. shape:           COIL, SHEET
+    33. thick:           continuous
+    34. width:           continuous
+    35. len:             continuous
+    36. oil:             -,Y,N
+    37. bore:            0000,0500,0600,0760
+    38. packing: -,1,2,3
+    classes:        1,2,3,4,5,U
+
+    -- The '-' values are actually 'not_applicable' values rather than
+       'missing_values' (and so can be treated as legal discrete
+       values rather than as showing the absence of a discrete value).
+
+ 8. Missing Attribute Values: Signified with &quot;?&quot;
+    Attribute:  Number of instances missing its value:
+    1           0
+    2           0
+    3           70
+    4           0
+    5           0
+    6           675
+    7           271
+    8           283
+    9           0
+   10           703
+   11           790
+   12           217
+   13           785
+   14           797
+   15           680
+   16           736
+   17           609
+   18           662
+   19           798
+   20           775
+   21           791
+   22           730
+   23           798
+   24           796
+   25           772
+   26           798
+   27           793
+   28           753
+   29           798
+   30           798
+   31           798
+   32           0
+   33           0
+   34           0
+   35           0
+   36           740
+   37           0
+   38           789
+   39           0
+
+ 9. Distribution of Classes
+      Class Name:   Number of Instances:
+      1               8
+      2              88
+      3             608
+      4               0
+      5              60
+      U              34
+                    ---
+                    798</oml:description>
+  <oml:description_version>1</oml:description_version>
+  <oml:format>ARFF</oml:format>
+        <oml:upload_date>2014-04-06T23:19:24</oml:upload_date>
+    <oml:licence>Public</oml:licence>  <oml:url>https://test.openml.org/data/v1/download/1/anneal.arff</oml:url>
+    <oml:file_id>1</oml:file_id>  <oml:default_target_attribute>class</oml:default_target_attribute>      <oml:version_label>1</oml:version_label>    <oml:tag>study_14</oml:tag><oml:tag>test_lazy_tag_107378</oml:tag><oml:tag>test_lazy_tag_120303</oml:tag><oml:tag>test_lazy_tag_131210</oml:tag><oml:tag>test_lazy_tag_254767</oml:tag><oml:tag>test_lazy_tag_38672</oml:tag><oml:tag>test_lazy_tag_452011</oml:tag><oml:tag>test_lazy_tag_45600</oml:tag><oml:tag>test_lazy_tag_574410</oml:tag><oml:tag>test_lazy_tag_624164</oml:tag><oml:tag>test_lazy_tag_834703</oml:tag><oml:tag>test_lazy_tag_898124</oml:tag><oml:tag>test_lazy_tag_959534</oml:tag>  <oml:visibility>public</oml:visibility>        <oml:status>active</oml:status>
+  <oml:processing_date>2024-01-10 13:50:55</oml:processing_date>      <oml:md5_checksum>4eaed8b6ec9d8211024b6c089b064761</oml:md5_checksum>
+</oml:data_set_description>