Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion tools/citation-manager/src/ParsedDocument.ts
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,9 @@ class ParsedDocument {
.replace(/\^/g, "") // Remove caret
.replace(/%%/g, "") // Remove comment markers
.replace(/\[\[/g, "") // Remove wiki open
.replace(/\]\]/g, ""); // Remove wiki close
.replace(/\]\]/g, "") // Remove wiki close
.replace(/\s+/g, " ") // Collapse whitespace (matches URL-encoded anchor generation)
.trim();
}

/**
Expand Down
69 changes: 69 additions & 0 deletions tools/citation-manager/test/parsed-document-extraction.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,75 @@ describe("ParsedDocument Content Extraction", () => {
expect(section).toContain("Test: A | B ^ C");
});

it("should match URL-decoded anchor with collapsed whitespace against heading with colon (Issue #14)", () => {
  // Scenario (Issue #14): the heading text is "Level 3: Components", while the
  // anchor taken from a URL-encoded link ("Level%203%20Components") decodes to
  // "Level 3 Components" — no colon, single spaces only.
  // NOTE(review): the original note says stripping the colon leaves a run of
  // two spaces in the normalized heading, which is why the lookup used to
  // fail. The colon handling itself is not visible in this test — confirm
  // against ParsedDocument's normalization.
  const headingText = "Level 3: Components";
  const followingHeadingText = "Next Section";

  // Token stream shaped like the parser output: target heading, its body
  // paragraph, then the heading that terminates the section.
  const tokenStream = [
    {
      type: "heading",
      depth: 2,
      text: headingText,
      raw: "## Level 3: Components\n",
    },
    { type: "paragraph", raw: "\nComponent details here.\n\n" },
    {
      type: "heading",
      depth: 2,
      text: followingHeadingText,
      raw: "## Next Section",
    },
  ];

  const fixture = {
    content: "## Level 3: Components\n\nComponent details here.\n\n## Next Section",
    tokens: tokenStream,
    headings: [
      { text: headingText, level: 2 },
      { text: followingHeadingText, level: 2 },
    ],
  };
  const parsed = new ParsedDocument(fixture);

  // Look the section up with the decoded, single-spaced anchor.
  const extracted = parsed.extractSection("Level 3 Components");

  // The heading must be found, its body included, and the next section left out.
  expect(extracted).not.toBeNull();
  expect(extracted).toContain("Level 3: Components");
  expect(extracted).toContain("Component details here");
  expect(extracted).not.toContain("Next Section");
});

it("should match URL-decoded anchor with collapsed whitespace for multiple special chars (Issue #14)", () => {
  // Scenario (Issue #14): the heading contains several characters that the
  // original note calls Obsidian-invalid (":" and "|"). Per that note they are
  // removed during normalization, which can leave runs of spaces behind
  // (e.g. around where "|" was), whereas the URL-encoded anchor
  // "Story%201.5%20Cache%20Design" decodes to a single-spaced string.
  // NOTE(review): the removal step is not visible in this test — confirm
  // against ParsedDocument's normalization.
  const headingText = "Story 1.5: Cache | Design";

  const fixture = {
    content: "## Story 1.5: Cache | Design\n\nDesign content.",
    tokens: [
      {
        type: "heading",
        depth: 2,
        text: headingText,
        raw: "## Story 1.5: Cache | Design\n",
      },
      { type: "paragraph", raw: "\nDesign content." },
    ],
    headings: [{ text: headingText, level: 2 }],
  };
  const parsed = new ParsedDocument(fixture);

  // Query with the decoded anchor, where every whitespace run is one space.
  const extracted = parsed.extractSection("Story 1.5 Cache Design");

  // The section must be located and still carry the original heading text.
  expect(extracted).not.toBeNull();
  expect(extracted).toContain("Story 1.5: Cache | Design");
});

it("should still match headings without Obsidian invalid characters (backward compatibility)", () => {
// Given: Document with normal heading (no invalid characters)
const parserOutput = {
Expand Down
Loading