diff --git a/.github/workflows/core-install.yml b/.github/workflows/core-install.yml
index 850633b..0d587e8 100644
--- a/.github/workflows/core-install.yml
+++ b/.github/workflows/core-install.yml
@@ -1,7 +1,4 @@
-# This workflow will install Python dependencies, run tests and lint with a single version of Python
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
-
-name: Build
+name: Build and Test
 
 on:
   push:
@@ -11,15 +8,14 @@ on:
 
 jobs:
   build:
-
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python 3.11
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
       with:
-        python-version: 3.11
+        python-version: '3.11'
     - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
@@ -30,7 +26,7 @@ jobs:
         # stop the build if there are Python syntax errors or undefined names
         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+        flake8 . --count --exit-zero --max-complexity=15 --max-line-length=127 --statistics
     - name: Test with pytest
       run: |
-        cd tests; py.test;
+        python -m pytest tests/ -v
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
index 0f6798b..006f0b5 100644
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@@ -1,19 +1,18 @@
-name: Py 3.11
+name: Python 3.11
 
 on: [push]
 
 jobs:
   build:
-
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python 3.11
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
       with:
-        python-version: 3.11
+        python-version: '3.11'
     - name: Install dependencies
       run: |
diff --git a/autoBOTLib/__init__.py b/autoBOTLib/__init__.py
index 4f053bc..9424a3c 100644
--- a/autoBOTLib/__init__.py
+++ b/autoBOTLib/__init__.py
@@ -1,3 +1,21 @@
+import os
+import logging
+import nltk
+
+# Configure logging first
+logging.basicConfig(format='%(asctime)s - %(message)s',
+                    datefmt='%d-%b-%y %H:%M:%S')
+logging.getLogger().setLevel(logging.INFO)
+
+# Set environment variables
+os.environ['TOKENIZERS_PARALLELISM'] = "false"
+
+# Download NLTK resources
+nltk.download('stopwords', quiet=True)
+nltk.download('punkt_tab', quiet=True)
+nltk.download('averaged_perceptron_tagger_eng', quiet=True)
+
+# Import all module functionality
 from autoBOTLib.features.features_keyword import *
 from autoBOTLib.features.features_contextual import *
 from autoBOTLib.features.features_token_relations import *
@@ -10,17 +28,3 @@
 from autoBOTLib.optimization.optimization_feature_constructors import *
 from autoBOTLib.optimization.optimization_engine import *
 from autoBOTLib.misc.misc_helpers import *
-
-import nltk
-nltk.download('stopwords', quiet=True)
-nltk.download('punkt_tab', quiet=True)
-nltk.download('averaged_perceptron_tagger_eng', quiet=True)
-
-import os
-import logging
-
-logging.basicConfig(format='%(asctime)s - %(message)s',
-                    datefmt='%d-%b-%y %H:%M:%S')
-logging.getLogger().setLevel(logging.INFO)
-
-os.environ['TOKENIZERS_PARALLELISM'] = "false"
diff --git a/autoBOTLib/__main__.py b/autoBOTLib/__main__.py
index 75d5f71..cffdbcd 100644
--- a/autoBOTLib/__main__.py
+++ b/autoBOTLib/__main__.py
@@ -45,8 +45,7 @@ def main():
         "--framework",
         default="scikit",
         type=str,
-        help=
-        "The computational ML back-end to use. Currently supports scikit (Default) and pyTorch (neural nets for sparse inputs)"
+        help="The computational ML back-end to use. Currently supports scikit (Default) and pyTorch (neural nets for sparse inputs)"
     )
     parser.add_argument("--memory_storage", default="memory", type=str)
     parser.add_argument("--sparsity", default=0.05, type=float)
diff --git a/autoBOTLib/misc/misc_keyword_detection.py b/autoBOTLib/misc/misc_keyword_detection.py
index 58c6945..3a8241c 100644
--- a/autoBOTLib/misc/misc_keyword_detection.py
+++ b/autoBOTLib/misc/misc_keyword_detection.py
@@ -80,10 +80,8 @@ def corpus_graph(self,
 
     def process_line(line):
 
-        nonlocal G
         nonlocal ctx
         nonlocal reps
-        nonlocal dictionary_with_counts_of_pairs
 
         stop = list(string.punctuation)
         line = line.strip()
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..11c1325
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,50 @@
+[project]
+name = "autoBOTLib"
+description = "AutoBOT: Explainable AutoML for texts"
+authors = [
+    {name = "Blaž Škrlj", email = "blaz.skrlj@ijs.si"},
+]
+license = {text = "BSD-3-Clause-Clear"}
+readme = "README.md"
+requires-python = ">=3.8"
+dynamic = ["version", "dependencies", "classifiers", "scripts"]
+
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.flake8]
+max-line-length = 127
+max-complexity = 15
+ignore = [
+    "E402",  # module level import not at top of file (temporarily ignore due to NLTK downloads)
+    "F401",  # imported but unused (temporarily ignore due to wildcard imports)
+    "F403",  # star import used (temporarily ignore while maintaining compatibility)
+    "W503",  # line break before binary operator
+]
+exclude = [
+    ".git",
+    "__pycache__",
+    ".pytest_cache",
+    "build",
+    "dist",
+    "*.egg-info",
+    ".tox",
+    ".venv",
+]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py", "*_test.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+addopts = [
+    "-v",
+    "--tb=short",
+]
+
+[tool.isort]
+profile = "black"
+multi_line_output = 3
+line_length = 127
+known_first_party = ["autoBOTLib"]
\ No newline at end of file
diff --git a/setup.py b/setup.py
index cc784ac..fd3b4e2 100644
--- a/setup.py
+++ b/setup.py
@@ -1,39 +1,22 @@
-from os import path
+from pathlib import Path
 from setuptools import setup, find_packages
-from setuptools.command.install import install
-import subprocess
-import sys
+
 
 def parse_requirements(file):
+    """Parse requirements from requirements.txt file."""
     required_packages = []
-    with open(path.join(path.dirname(__file__), file)) as req_file:
-        for line in req_file:
-            # Exclude any comments or empty lines
-            line = line.strip()
-            if line and not line.startswith("#"):
-                required_packages.append(line)
+    requirements_path = Path(__file__).parent / file
+    try:
+        with open(requirements_path) as req_file:
+            for line in req_file:
+                # Exclude any comments or empty lines
+                line = line.strip()
+                if line and not line.startswith("#"):
+                    required_packages.append(line)
+    except FileNotFoundError:
+        print(f"Warning: {file} not found. Using default requirements.")
     return required_packages
 
 
-class PostInstallCommand(install):
-    """Post-installation for downloading NLTK resources."""
-    def run(self):
-        install.run(self)
-
-        try:
-            import nltk
-        except ImportError:
-            print("NLTK is not installed. Installing NLTK...")
-            subprocess.check_call([sys.executable, "-m", "pip", "install", "nltk"])
-            import nltk
-        try:
-            print("Downloading NLTK 'stopwords' resource...")
-            for lib in ['stopwords', 'punkt_tab', 'averaged_perceptron_tagger_eng']:
-                subprocess.check_call([sys.executable, "-m", "nltk.downloader", lib])
-            print(f"NLTK {lib} downloaded successfully.")
-        except subprocess.CalledProcessError as e:
-            print(f"Failed to download NLTK 'stopwords': {e}")
-            sys.exit(1)  # Exit with error code
-
 long_description = """
 autoBOT is an AutoML system for text classification with an emphasis on explainability.
 It implements the idea of *representation evolution*, learning to combine representations
@@ -58,5 +41,19 @@ def run(self):
     packages=packages,
     zip_safe=False,
     include_package_data=True,
-    install_requires=parse_requirements("requirements.txt")
+    install_requires=parse_requirements("requirements.txt"),
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: BSD License",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: Text Processing :: Linguistic",
+    ],
+    python_requires=">=3.8",
 )
diff --git a/tests/minimal_functionality_test.py b/tests/minimal_functionality_test.py
index c611e1c..3388dc9 100644
--- a/tests/minimal_functionality_test.py
+++ b/tests/minimal_functionality_test.py
@@ -6,11 +6,12 @@
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn import pipeline  ## A necessary import
 import pytest
+import os
 
 
 def test_minimal_mlc():
     ## Load example data frame
-    dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t")
+    dataframe = pd.read_csv("data/insults/train.tsv", sep="\t")
     train_sequences = dataframe['text_a']
     train_targets_c1 = dataframe['label'].values.tolist()
     train_targets_c2 = [
@@ -32,7 +33,7 @@ def test_minimal_mlc():
         strategy="direct-learning"
     )  ## strategy = "direct-learning" trains a single learner.
 
-    dataframe2 = pd.read_csv("../data/insults/test.tsv", sep="\t")
+    dataframe2 = pd.read_csv("data/insults/test.tsv", sep="\t")
     test_sequences = dataframe2['text_a']
     predictions = autoBOTLibObj.predict(test_sequences)
     prob_predictions = autoBOTLibObj.predict_proba(test_sequences)
@@ -45,7 +46,7 @@
 
 def test_minimal():
     ## Load example data frame
-    dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t").iloc[:500]
+    dataframe = pd.read_csv("data/insults/train.tsv", sep="\t").iloc[:500]
     train_sequences = dataframe['text_a']
     train_targets = dataframe['label']
 
@@ -63,7 +64,7 @@ def test_minimal():
         strategy="evolution"
     )  ## strategy = "direct-learning" trains a single learner.
 
-    dataframe2 = pd.read_csv("../data/insults/test.tsv", sep="\t")
+    dataframe2 = pd.read_csv("data/insults/test.tsv", sep="\t")
     test_sequences = dataframe2['text_a']
     predictions = autoBOTLibObj.predict(test_sequences)
     prob_predictions = autoBOTLibObj.predict_proba(test_sequences)
@@ -82,7 +83,7 @@ def test_minimal():
 def test_initializations(fold_number, representation_type, sparsity,
                          time_constraint):
 
-    dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t")
+    dataframe = pd.read_csv("data/insults/train.tsv", sep="\t")
     train_sequences = dataframe['text_a']
     train_targets = dataframe['label']
     autoBOTLibObj = autoBOTLib.GAlearner(