Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion deployment/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def run_integration_tests(coverage=False, verbose=False, timeout_seconds=900):
"""Run integration tests."""
logger.info("Running integration tests...")

command = f"{sys.executable} -m pytest tests/integration/"
command = f"{sys.executable} -m pytest tests/"

if coverage:
command += " --cov=server"
Expand Down
2 changes: 1 addition & 1 deletion server/python_backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ async def create_email(
ai_analysis = await ai_engine.analyze_email(email.subject, email.content)

# Apply smart filters
filter_results = await filter_manager.apply_filters_to_email(email.dict())
filter_results = await filter_manager.apply_filters_to_email_data(email.dict()) # Corrected method name

Comment on lines 255 to 257
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Awaiting a non-coroutine will raise a runtime TypeError.

SmartFilterManager.apply_filters_to_email_data is a synchronous method (see smart_filters.py).
await expects a coroutine / awaitable; in production this line will blow up once the patch-based tests are removed.

-        filter_results = await filter_manager.apply_filters_to_email_data(email.dict())  # Corrected method name
+        filter_results = filter_manager.apply_filters_to_email_data(email.dict())

Alternatively, convert apply_filters_to_email_data to async def if you truly need asynchronous behaviour.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Apply smart filters
filter_results = await filter_manager.apply_filters_to_email(email.dict())
filter_results = await filter_manager.apply_filters_to_email_data(email.dict()) # Corrected method name
# Apply smart filters
filter_results = filter_manager.apply_filters_to_email_data(email.dict()) # Corrected method name
🤖 Prompt for AI Agents
In server/python_backend/main.py around lines 255 to 257, the code incorrectly
uses 'await' on the synchronous method 'apply_filters_to_email_data' from
SmartFilterManager, which will cause a runtime TypeError. To fix this, remove
the 'await' keyword before the method call since it is not a coroutine.
Alternatively, if asynchronous behavior is required, refactor
'apply_filters_to_email_data' in smart_filters.py to be an async function and
update its implementation accordingly.

# Create email with enhanced data
email_data = email.dict()
Expand Down
26 changes: 23 additions & 3 deletions server/python_nlp/action_item_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
# Attempt to import NLTK for POS tagging
try:
import nltk
# Check if necessary NLTK data is available (downloads should be handled by launch.py)
# Initial check for base resources. More specific checks can be done in class __init__.
nltk.data.find('taggers/averaged_perceptron_tagger')
nltk.data.find('tokenizers/punkt')
HAS_NLTK = True
except (ImportError, nltk.downloader.ErrorMessage): # Catch both import error and find error
except (ImportError, LookupError): # Catch both import error and lookup error for missing NLTK data
HAS_NLTK = False

logger = logging.getLogger(__name__)
Expand All @@ -21,15 +21,35 @@ class ActionItemExtractor:
"""

def __init__(self):
if HAS_NLTK:
resources_to_check = {
"punkt_tab": "tokenizers/punkt_tab",
"averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
"averaged_perceptron_tagger_eng": "taggers/averaged_perceptron_tagger_eng", # Added for explicit check
"punkt": "tokenizers/punkt" # Redundant with top-level check but good for robustness
}
for resource_name, resource_path in resources_to_check.items():
try:
nltk.data.find(resource_path)
except LookupError:
logger.info(f"NLTK '{resource_name}' resource ({resource_path}) not found. Downloading...")
try:
nltk.download(resource_name, quiet=True)
except Exception as e_download:
logger.error(f"An error occurred while downloading '{resource_name}': {e_download}")
except Exception as e:
logger.error(f"An unexpected error occurred while checking for '{resource_name}': {e}")

# Regex for keywords indicating action items
self.action_keywords_regex = re.compile(
r'\b(please|task:|action:|need to|required to|must|should|can you|could you|will you)\b',
r'\b(task:|action required:|action:)\s|\b(please|need to|required to|must|should|can you|could you|will you)\b',
re.IGNORECASE
)
# Regex for simple due date patterns
# This is a basic version and can be expanded significantly
self.due_date_regex = re.compile(
r'\b(by (next )?(monday|tuesday|wednesday|thursday|friday|saturday|sunday|tomorrow|end of day|eod)|'
r'on (monday|tuesday|wednesday|thursday|friday|saturday|sunday)|' # Added for "on DayName"
r'on \d{1,2}(st|nd|rd|th)? (jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)(\w*)?(\s\d{4})?|'
r'in \d+ (days?|weeks?|months?)|'
r'next (week|month|year))\b',
Expand Down
2 changes: 1 addition & 1 deletion server/python_nlp/data_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import re
import json
import hashlib
from typing import Dict, List, tuple, Optional, Any
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict
from datetime import datetime
import logging
Expand Down
2 changes: 1 addition & 1 deletion server/python_nlp/gmail_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import time
import json
import sqlite3
from typing import Dict, List, Optional, tuple, Any
from typing import Dict, List, Optional, Any
from dataclasses import dataclass
from datetime import datetime, timedelta
import logging
Expand Down
3 changes: 2 additions & 1 deletion server/python_nlp/gmail_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,8 @@ def _get_basic_fallback_analysis_structure(self, error_message: str) -> Dict[str
'error': error_message, 'topic': 'unknown', 'sentiment': 'neutral',
'intent': 'unknown', 'urgency': 'low', 'confidence': 0.0,
'keywords': [], 'categories': ['unknown'], 'reasoning': f'Fallback due to error: {error_message}',
'suggested_labels': ['unknown'], 'risk_flags': ['analysis_failed']
'suggested_labels': ['unknown'], 'risk_flags': ['analysis_failed'],
'action_items': [] # Added missing action_items
}

def _convert_to_db_format(self, gmail_metadata: GmailMessage, ai_analysis_result: Optional[Dict[str, Any]]) -> Dict[str, Any]:
Expand Down
41 changes: 29 additions & 12 deletions server/python_nlp/nlp_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@
)

# Define paths for pre-trained models
MODEL_DIR = os.getenv("NLP_MODEL_DIR", os.path.dirname(__file__))
SENTIMENT_MODEL_PATH = os.path.join(MODEL_DIR, "sentiment_model.pkl")
TOPIC_MODEL_PATH = os.path.join(MODEL_DIR, "topic_model.pkl")
INTENT_MODEL_PATH = os.path.join(MODEL_DIR, "intent_model.pkl")
URGENCY_MODEL_PATH = os.path.join(MODEL_DIR, "urgency_model.pkl")
# MODEL_DIR = os.getenv("NLP_MODEL_DIR", os.path.dirname(__file__)) # Moved to __init__
# SENTIMENT_MODEL_PATH = os.path.join(MODEL_DIR, "sentiment_model.pkl")
# TOPIC_MODEL_PATH = os.path.join(MODEL_DIR, "topic_model.pkl")
# INTENT_MODEL_PATH = os.path.join(MODEL_DIR, "intent_model.pkl")
# URGENCY_MODEL_PATH = os.path.join(MODEL_DIR, "urgency_model.pkl")


class NLPEngine:
Expand All @@ -59,8 +59,24 @@ class NLPEngine:

def __init__(self):
"""Initialize the NLP engine and load required models."""

# Define model paths dynamically using env var at instantiation time
model_dir = os.getenv("NLP_MODEL_DIR", os.path.dirname(__file__))
self.sentiment_model_path = os.path.join(model_dir, "sentiment_model.pkl")
self.topic_model_path = os.path.join(model_dir, "topic_model.pkl")
self.intent_model_path = os.path.join(model_dir, "intent_model.pkl")
self.urgency_model_path = os.path.join(model_dir, "urgency_model.pkl")

# Initialize stop words if NLTK is available
self.stop_words = set(nltk.corpus.stopwords.words('english')) if HAS_NLTK else set()
if HAS_NLTK:
try:
nltk.data.find('corpora/stopwords')
except LookupError:
logger.info("NLTK 'stopwords' resource not found. Downloading...")
nltk.download('stopwords', quiet=True)
self.stop_words = set(nltk.corpus.stopwords.words('english'))
else:
self.stop_words = set()

# Initialize model attributes
self.sentiment_model = None
Expand All @@ -74,10 +90,10 @@ def __init__(self):
# Load models if dependencies are available
if HAS_SKLEARN_AND_JOBLIB:
logger.info("Attempting to load NLP models...")
self.sentiment_model = self._load_model(SENTIMENT_MODEL_PATH)
self.topic_model = self._load_model(TOPIC_MODEL_PATH)
self.intent_model = self._load_model(INTENT_MODEL_PATH)
self.urgency_model = self._load_model(URGENCY_MODEL_PATH)
self.sentiment_model = self._load_model(self.sentiment_model_path)
self.topic_model = self._load_model(self.topic_model_path)
self.intent_model = self._load_model(self.intent_model_path)
self.urgency_model = self._load_model(self.urgency_model_path)
else:
logger.warning(
"Scikit-learn or joblib not available. "
Expand Down Expand Up @@ -347,15 +363,16 @@ def _analyze_topic_keyword(self, text: str) -> Dict[str, Any]:
# Calculate confidence score
# Using a simple heuristic: matched_keywords / 5.0 (capped at 0.9)
confidence = min(topic_scores[best_topic] / 5.0, 0.9)
normalized_topic = best_topic.lower().replace(" & ", "_").replace(" ", "_")

return {
'topic': best_topic,
'topic': normalized_topic, # Normalized topic
'confidence': max(0.1, confidence),
'method_used': 'fallback_keyword_topic'
}
else:
return {
'topic': 'General',
'topic': 'general_communication', # Consistent fallback topic name
'confidence': 0.5,
'method_used': 'fallback_keyword_topic'
}
Expand Down
41 changes: 32 additions & 9 deletions server/python_nlp/smart_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,32 @@ class SmartFilterManager:
def __init__(self, db_path: str = "smart_filters.db"):
self.db_path = db_path
self.logger = logging.getLogger(__name__)
self.conn = None # For persistent in-memory connection

if self.db_path == ":memory:":
self.conn = sqlite3.connect(":memory:")
self.conn.row_factory = sqlite3.Row
Comment on lines +53 to +55
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (bug_risk): Make in-memory SQLite connection thread-safe if shared across threads

Consider setting check_same_thread=False when creating the connection, or use a separate connection for each thread to avoid threading errors.

Suggested change
if self.db_path == ":memory:":
self.conn = sqlite3.connect(":memory:")
self.conn.row_factory = sqlite3.Row
if self.db_path == ":memory:":
# Allow the in-memory SQLite connection to be shared across threads
self.conn = sqlite3.connect(":memory:", check_same_thread=False)
self.conn.row_factory = sqlite3.Row


self._init_filter_db()

# Filter categories and templates
self.filter_templates = self._load_filter_templates()
self.pruning_criteria = self._load_pruning_criteria()

def _get_db_connection(self) -> sqlite3.Connection:
"""Get a new database connection."""
"""Get a database connection."""
if self.conn: # Use persistent in-memory connection if it exists
return self.conn
# For file-based DBs, create a new connection each time (or manage a pool)
conn = sqlite3.connect(self.db_path)
conn.row_factory = sqlite3.Row # Access columns by name
conn.row_factory = sqlite3.Row
return conn

def _close_db_connection(self, conn: sqlite3.Connection):
"""Close connection if it's not the persistent in-memory one."""
if conn is not self.conn:
conn.close()

Comment on lines +52 to +76
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Persistent in-memory connection logic is sound but leaks for file DBs

_get_db_connection opens a new connection for every call when db_path is not ":memory:", and _db_execute/_db_fetch* close it afterwards – fine.
However, long-running loops that call these helpers thousands of times will continually open/close SQLite handles, which is slow.

Consider:

  1. Keeping a single cached connection per file DB (same pattern you applied to ":memory:").
  2. Or wrapping calls in a context-manager when bulk operations are needed.

Otherwise performance degradation is likely for heavy filter traffic.

🤖 Prompt for AI Agents
In server/python_nlp/smart_filters.py around lines 52 to 76, the current code
opens and closes a new SQLite connection on every call for file-based databases,
causing performance issues during heavy usage. To fix this, implement a
persistent cached connection for file-based DBs similar to the in-memory
connection by storing and reusing a single connection instance. Alternatively,
provide a context manager for bulk operations to manage connections efficiently
and avoid frequent open/close cycles.

def _db_execute(self, query: str, params: tuple = ()):
"""Execute a query (INSERT, UPDATE, DELETE)."""
conn = self._get_db_connection()
Expand All @@ -70,7 +84,7 @@ def _db_execute(self, query: str, params: tuple = ()):
self.logger.error(f"Database error: {e} with query: {query[:100]}")
# Optionally re-raise or handle
finally:
conn.close()
self._close_db_connection(conn)

def _db_fetchone(self, query: str, params: tuple = ()) -> Optional[sqlite3.Row]:
"""Execute a query and fetch one row."""
Expand All @@ -82,7 +96,7 @@ def _db_fetchone(self, query: str, params: tuple = ()) -> Optional[sqlite3.Row]:
self.logger.error(f"Database error: {e} with query: {query[:100]}")
return None
finally:
conn.close()
self._close_db_connection(conn)

def _db_fetchall(self, query: str, params: tuple = ()) -> List[sqlite3.Row]:
"""Execute a query and fetch all rows."""
Expand All @@ -94,7 +108,7 @@ def _db_fetchall(self, query: str, params: tuple = ()) -> List[sqlite3.Row]:
self.logger.error(f"Database error: {e} with query: {query[:100]}")
return []
finally:
conn.close()
self._close_db_connection(conn)

def _init_filter_db(self):
"""Initialize filter database"""
Expand Down Expand Up @@ -151,12 +165,20 @@ def _init_filter_db(self):
conn = self._get_db_connection()
try:
for query in queries:
conn.execute(query)
conn.commit()
conn.execute(query) # Uses the connection from _get_db_connection
conn.commit() # Commit on the same connection
except sqlite3.Error as e:
self.logger.error(f"Database initialization error: {e}")
finally:
conn.close()
# Do not close if it's the persistent self.conn
self._close_db_connection(conn)


def close(self):
"""Close the persistent database connection if it exists."""
if self.conn:
self.conn.close()
self.conn = None

def _load_filter_templates(self) -> Dict[str, Dict[str, Any]]:
"""Load intelligent filter templates"""
Expand Down Expand Up @@ -1059,7 +1081,8 @@ def _save_filter_performance(self, performance: FilterPerformance):
f1_score, processing_time_ms, emails_processed, true_positives, false_positives, false_negatives)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
performance_id = f"{performance.filter_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
# Added microseconds for more unique ID
performance_id = f"{performance.filter_id}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}"
params = (
performance_id,
performance.filter_id,
Expand Down
33 changes: 33 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import pytest
from unittest.mock import AsyncMock
from server.python_backend.main import app # Assuming 'app' is your FastAPI instance
from server.python_backend.database import get_db # The actual dependency

@pytest.fixture(scope="session", autouse=True)
def mock_db_session_override():
# Create a single AsyncMock instance for the session
mock_db = AsyncMock()
# Pre-configure any commonly returned objects or default behaviors if necessary
# For example, if many methods return a list:
# mock_db.get_all_categories = AsyncMock(return_value=[])
# mock_db.get_emails = AsyncMock(return_value=[])
# ... etc. Specific tests can override these.

# This is the dependency that needs to be overridden
# Ensure this path is correct for your project structure
original_get_db_override = app.dependency_overrides.get(get_db) # Store original override, if any

# Define the override function that will return our session-scoped mock
async def override_get_db_for_session():
return mock_db

Comment on lines +21 to +23
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Dependency override should yield, not return

get_db in most FastAPI back-ends is an async generator (yield session).
Returning the mock directly breaks the dependency-teardown cycle and causes:

RuntimeError: coroutine was never awaited when the overridden dependency is awaited as an async-gen.
• Suppressed fixture finalisation for db sessions that rely on yield.

Fix:

-async def override_get_db_for_session():
-    return mock_db
+async def override_get_db_for_session():
+    yield mock_db
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
async def override_get_db_for_session():
return mock_db
async def override_get_db_for_session():
yield mock_db
🤖 Prompt for AI Agents
In tests/conftest.py around lines 21 to 23, the override_get_db_for_session
function incorrectly returns the mock_db instead of yielding it, which breaks
the async generator pattern expected by FastAPI dependencies. Change the
function to use 'yield mock_db' instead of 'return mock_db' to properly support
the async generator behavior and ensure correct dependency teardown and
avoidance of runtime errors.

app.dependency_overrides[get_db] = override_get_db_for_session

yield mock_db # Provide the mock to tests if they request it by this fixture name

# Teardown: Restore original dependency override if it existed, or clear it
if original_get_db_override:
app.dependency_overrides[get_db] = original_get_db_override
else:
if get_db in app.dependency_overrides: # Check if key exists before deleting
del app.dependency_overrides[get_db]
23 changes: 11 additions & 12 deletions tests/test_action_item_extractor.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import unittest
from unittest.mock import patch
# from server.python_nlp.action_item_extractor import ActionItemExtractor, HAS_NLTK # Commented out for debug
HAS_NLTK = False # Stubbing HAS_NLTK
from server.python_nlp.action_item_extractor import ActionItemExtractor, HAS_NLTK

class TestActionItemExtractor(unittest.TestCase):

def setUp(self):
# self.extractor = ActionItemExtractor() # Commented out for debug
self.extractor = None # Placeholder
self.extractor = ActionItemExtractor()

def test_extract_actions_clear_phrase_with_due_date(self):
text = "Please review the attached document by Friday."
Expand Down Expand Up @@ -76,15 +74,16 @@ def test_extract_actions_multiple_action_items(self):


def test_extract_actions_simple_due_date_tomorrow(self):
text = "Submit the expenses by tomorrow."
actions = self.extractor.extract_actions(text)
self.assertEqual(len(actions), 1) # Assuming "Submit" isn't a keyword, but "by tomorrow" is part of sentence
# The current extractor triggers on keywords like "please", "need to" etc.
# Let's adjust the text to include a keyword.
text_no_keyword = "Submit the expenses by tomorrow."
actions_no_keyword = self.extractor.extract_actions(text_no_keyword)
self.assertEqual(len(actions_no_keyword), 0) # Text without keyword should not produce action

# Now test with a keyword
text_with_keyword = "You should Submit the expenses by tomorrow."
actions = self.extractor.extract_actions(text_with_keyword)
self.assertEqual(len(actions), 1)
self.assertEqual(actions[0]['raw_due_date_text'], "by tomorrow")
actions_with_keyword = self.extractor.extract_actions(text_with_keyword)
self.assertEqual(len(actions_with_keyword), 1)
Comment on lines +77 to +84
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (code-quality): Extract duplicate code into method (extract-duplicate-method)

self.assertEqual(actions_with_keyword[0]['raw_due_date_text'], "by tomorrow")
self.assertTrue(actions_with_keyword[0]['action_phrase'].startswith("should Submit the expenses"))

def test_extract_actions_due_date_on_monday(self):
text = "We need to finish this on Monday."
Expand Down
Loading