Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions openml/datasets/data_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ class OpenMLDataFeature:
list of the possible values, in case of nominal attribute
number_missing_values : int
Number of rows that have a missing value for this feature.
ontologies : list(str)
list of ontologies attached to this feature. An ontology describes the
concepts that are described in a feature. An ontology is defined by a
URL where the information is provided.
"""

LEGAL_DATA_TYPES: ClassVar[Sequence[str]] = ["nominal", "numeric", "string", "date"]
Expand All @@ -34,6 +38,7 @@ def __init__( # noqa: PLR0913
data_type: str,
nominal_values: list[str],
number_missing_values: int,
ontologies: list[str] | None = None,
):
if not isinstance(index, int):
raise TypeError(f"Index must be `int` but is {type(index)}")
Expand Down Expand Up @@ -65,6 +70,7 @@ def __init__( # noqa: PLR0913
self.index = index
self.name = str(name)
self.data_type = str(data_type)
self.ontologies = ontologies
self.nominal_values = nominal_values
self.number_missing_values = number_missing_values

Expand Down
1 change: 1 addition & 0 deletions openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1069,6 +1069,7 @@ def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature
xmlfeature["oml:data_type"],
xmlfeature.get("oml:nominal_value"),
int(nr_missing),
xmlfeature.get("oml:ontology"),
)
if idx != feature.index:
raise ValueError("Data features not provided in right order")
Expand Down
57 changes: 57 additions & 0 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1061,6 +1061,63 @@ def fork_dataset(data_id: int) -> int:
return int(data_id)


def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool:
    """Attach an ontology (URL) to a single feature of a dataset.

    An ontology describes the concepts that are captured by a feature and
    is defined by a URL where that information is provided. The feature is
    addressed by the id of its dataset together with its index. The dataset
    has to exist on OpenML and needs to have been processed by the
    evaluation engine.

    Parameters
    ----------
    data_id : int
        Id of the dataset to which the feature belongs.

    index : int
        Index of the feature in the dataset (0-based).

    ontology : str
        URL to the ontology (max. 256 characters).

    Returns
    -------
    bool
        ``True``; an unsuccessful request is expected to surface as an
        OpenML server exception raised by the API call instead.
    """
    # Form fields sent with the POST request, in argument order.
    payload = dict(data_id=data_id, index=index, ontology=ontology)
    openml._api_calls._perform_api_call(
        "data/feature/ontology/add",
        "post",
        data=payload,
    )
    # Reaching this line means the API call above did not raise.
    return True


def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> bool:
    """Detach an existing ontology (URL) from a single feature of a dataset.

    The feature is addressed by the id of its dataset together with its
    index. The dataset has to exist on OpenML and needs to have been
    processed by the evaluation engine, and the ontology needs to be
    attached to that specific feature.

    Parameters
    ----------
    data_id : int
        Id of the dataset to which the feature belongs.

    index : int
        Index of the feature in the dataset (0-based).

    ontology : str
        URL to the ontology (max. 256 characters).

    Returns
    -------
    bool
        ``True``; an unsuccessful request is expected to surface as an
        OpenML server exception raised by the API call instead.
    """
    # Form fields sent with the POST request, in argument order.
    payload = dict(data_id=data_id, index=index, ontology=ontology)
    openml._api_calls._perform_api_call(
        "data/feature/ontology/remove",
        "post",
        data=payload,
    )
    # Reaching this line means the API call above did not raise.
    return True


def _topic_add_dataset(data_id: int, topic: str) -> int:
"""
Adds a topic for a dataset.
Expand Down
1,064 changes: 1,064 additions & 0 deletions tests/files/org/openml/test/datasets/1/dataset.arff

Large diffs are not rendered by default.

130 changes: 130 additions & 0 deletions tests/files/org/openml/test/datasets/1/description.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
<oml:data_set_description xmlns:oml="http://openml.org/openml">
<oml:id>1</oml:id>
<oml:name>anneal</oml:name>
<oml:version>1</oml:version>
<oml:description>**Author**:
**Source**: Unknown -
**Please cite**:

1. Title of Database: Annealing Data

2. Source Information: donated by David Sterling and Wray Buntine.

3. Past Usage: unknown

4. Relevant Information:
-- Explanation: I suspect this was left by Ross Quinlan in 1987 at the
4th Machine Learning Workshop. I'd have to check with Jeff Schlimmer
to double check this.

5. Number of Instances: 798

6. Number of Attributes: 38
-- 6 continuously-valued
-- 3 integer-valued
-- 29 nominal-valued

7. Attribute Information:
1. family: --,GB,GK,GS,TN,ZA,ZF,ZH,ZM,ZS
2. product-type: C, H, G
3. steel: -,R,A,U,K,M,S,W,V
4. carbon: continuous
5. hardness: continuous
6. temper_rolling: -,T
7. condition: -,S,A,X
8. formability: -,1,2,3,4,5
9. strength: continuous
10. non-ageing: -,N
11. surface-finish: P,M,-
12. surface-quality: -,D,E,F,G
13. enamelability: -,1,2,3,4,5
14. bc: Y,-
15. bf: Y,-
16. bt: Y,-
17. bw/me: B,M,-
18. bl: Y,-
19. m: Y,-
20. chrom: C,-
21. phos: P,-
22. cbond: Y,-
23. marvi: Y,-
24. exptl: Y,-
25. ferro: Y,-
26. corr: Y,-
27. blue/bright/varn/clean: B,R,V,C,-
28. lustre: Y,-
29. jurofm: Y,-
30. s: Y,-
31. p: Y,-
32. shape: COIL, SHEET
33. thick: continuous
34. width: continuous
35. len: continuous
36. oil: -,Y,N
37. bore: 0000,0500,0600,0760
38. packing: -,1,2,3
classes: 1,2,3,4,5,U

-- The '-' values are actually 'not_applicable' values rather than
'missing_values' (and so can be treated as legal discrete
values rather than as showing the absence of a discrete value).

8. Missing Attribute Values: Signified with &quot;?&quot;
Attribute: Number of instances missing its value:
1 0
2 0
3 70
4 0
5 0
6 675
7 271
8 283
9 0
10 703
11 790
12 217
13 785
14 797
15 680
16 736
17 609
18 662
19 798
20 775
21 791
22 730
23 798
24 796
25 772
26 798
27 793
28 753
29 798
30 798
31 798
32 0
33 0
34 0
35 0
36 740
37 0
38 789
39 0

9. Distribution of Classes
Class Name: Number of Instances:
1 8
2 88
3 608
4 0
5 60
U 34
---
798</oml:description>
<oml:description_version>1</oml:description_version>
<oml:format>ARFF</oml:format>
<oml:upload_date>2014-04-06T23:19:24</oml:upload_date>
<oml:licence>Public</oml:licence> <oml:url>https://test.openml.org/data/v1/download/1/anneal.arff</oml:url>
<oml:file_id>1</oml:file_id> <oml:default_target_attribute>class</oml:default_target_attribute> <oml:version_label>1</oml:version_label> <oml:tag>study_14</oml:tag><oml:tag>test_lazy_tag_107378</oml:tag><oml:tag>test_lazy_tag_120303</oml:tag><oml:tag>test_lazy_tag_131210</oml:tag><oml:tag>test_lazy_tag_254767</oml:tag><oml:tag>test_lazy_tag_38672</oml:tag><oml:tag>test_lazy_tag_452011</oml:tag><oml:tag>test_lazy_tag_45600</oml:tag><oml:tag>test_lazy_tag_574410</oml:tag><oml:tag>test_lazy_tag_624164</oml:tag><oml:tag>test_lazy_tag_834703</oml:tag><oml:tag>test_lazy_tag_898124</oml:tag><oml:tag>test_lazy_tag_959534</oml:tag> <oml:visibility>public</oml:visibility> <oml:status>active</oml:status>
<oml:processing_date>2024-01-10 13:50:55</oml:processing_date> <oml:md5_checksum>4eaed8b6ec9d8211024b6c089b064761</oml:md5_checksum>
</oml:data_set_description>
Loading