fix(indexing): normalize extension casing and tighten docs/test (#77)

Address CodeRabbit review:
- getLanguageFromExtension/getAstGrepLang lowercase the extension before
  consulting the override map, so an uppercase ext still matches the
  lowercased keys (production callers already lowercase; .R collapses to
  .r with the same value, so no other behavior changes).
- README: clarify that EXTRA_EXTENSIONS files are indexed as plaintext /
  leaf nodes, whereas EXTENSION_LANGUAGE_MAP gives full language treatment.
- Make the isIndexableFile override test hermetic (save/restore the prior
  map entry, assert against the captured baseline).
This commit is contained in:
Giancarlo Erra
2026-06-28 20:59:25 +01:00
parent 1c5ddf52be
commit c2c616f4a3
5 changed files with 24 additions and 8 deletions
+1 -1
View File
@@ -255,7 +255,7 @@ On VS Code's 2.45M-line codebase, SocratiCode answers architectural questions
- **Cross-project search** — Search across multiple related projects in a single query. Link projects via `.socraticode.json` or the `SOCRATICODE_LINKED_PROJECTS` env var, then set `includeLinked: true` on `codebase_search`. Results are tagged with project labels and deduplicated via client-side RRF fusion.
- **Branch-aware indexing** — Maintain separate indexes per git branch by setting `SOCRATICODE_BRANCH_AWARE=true`. Each branch gets its own Qdrant collections, so switching branches instantly switches to the correct index. Ideal for CI/CD pipelines and PR review workflows.
- **Respects ignore rules** — Honors all `.gitignore` files (root + nested), plus an optional `.socraticodeignore` for additional exclusions. Includes sensible built-in defaults. `.gitignore` processing can be disabled via `RESPECT_GITIGNORE=false`. Dot-directories (e.g. `.agent`) can be included via `INCLUDE_DOT_FILES=true`.
- **Custom file extensions** — Projects with non-standard extensions (e.g. `.tpl`, `.blade`) can be included via `EXTRA_EXTENSIONS` env var or `extraExtensions` tool parameter. Works for both indexing and code graph. To go further and treat a custom extension as a real language (full AST chunking, symbols, call graph) rather than plaintext, map it with `EXTENSION_LANGUAGE_MAP` (e.g. `.inc:php`).
- **Custom file extensions** — Projects with non-standard extensions (e.g. `.tpl`, `.blade`) can be included via `EXTRA_EXTENSIONS` env var or `extraExtensions` tool parameter. Such files are indexed as plaintext and appear as leaf nodes in the code graph (no AST chunking or symbols). To instead treat a custom extension as a real language (full AST chunking, symbols, call graph), map it with `EXTENSION_LANGUAGE_MAP` (e.g. `.inc:php`).
- **Configurable infrastructure** — All ports, hosts, and API keys are configurable via environment variables. Qdrant API key support for enterprise deployments.
- **Enterprise-ready simplicity** — No agent coordination tuning, no memory limit environment variables, no coordinator/conductor capacity knobs, no backpressure configuration. SocratiCode scales by relying on production-grade infrastructure (Qdrant, proven embedding APIs) rather than complex in-process orchestration.
- **Auto-setup & zero configuration** — Just install the Claude Plugin/Skill or add the MCP server to your AI host config. On first use, the server automatically checks Docker, pulls images, starts Qdrant and Ollama containers, and downloads the embedding model. No config files, no YAML, no environment variables to tune, no native dependencies to compile. Works everywhere Docker runs.
+5 -1
View File
@@ -390,6 +390,10 @@ export function getLanguageFromExtension(
ext: string,
override: Map<string, string> = EXTENSION_LANGUAGE_MAP,
): string {
const target = override.get(ext) ?? ext;
// Normalize so a caller passing an uppercase ext still matches the
// (lowercased) override keys; the only case-sensitive built-in key, `.R`,
// collapses to `.r` with the same value, so this changes nothing else.
const normalized = ext.toLowerCase();
const target = override.get(normalized) ?? normalized;
return EXTENSION_TO_LANGUAGE[target] || "plaintext";
}
+4 -1
View File
@@ -582,7 +582,10 @@ export function getAstGrepLang(
ext: string,
override: Map<string, string> = EXTENSION_LANGUAGE_MAP,
): Lang | string | null {
const target = override.get(ext) ?? ext;
// Match getLanguageFromExtension: normalize casing so override lookups (keys
// are stored lowercased) and the grammar stay aligned with the label.
const normalized = ext.toLowerCase();
const target = override.get(normalized) ?? normalized;
return EXTENSION_TO_AST_GREP_LANG[target] ?? null;
}
+2
View File
@@ -323,6 +323,8 @@ describe("constants", () => {
it("resolves an EXTENSION_LANGUAGE_MAP override to the target language label", () => {
const override = new Map([[".inc", ".php"]]);
expect(getLanguageFromExtension(".inc", override)).toBe("php");
// Case-insensitive: an uppercase ext still matches the lowercased key.
expect(getLanguageFromExtension(".INC", override)).toBe("php");
// Unmapped extensions are unaffected by the override.
expect(getLanguageFromExtension(".xyz", override)).toBe("plaintext");
});
+12 -5
View File
@@ -128,16 +128,21 @@ describe("indexer utilities", () => {
});
it("accepts extensions registered via EXTENSION_LANGUAGE_MAP", () => {
// isIndexableFile reads the global override map; mutate it directly
// (it is the same Map the function consults) and clean up after.
// isIndexableFile reads the global override map; mutate it directly (it is
// the same Map the function consults) and restore the prior state after,
// so the test stays hermetic even if `.inc` is ever pre-configured.
const baseline = isIndexableFile("foo.class.inc");
const hadPrevious = EXTENSION_LANGUAGE_MAP.has(".inc");
const previous = EXTENSION_LANGUAGE_MAP.get(".inc");
EXTENSION_LANGUAGE_MAP.set(".inc", ".php");
try {
expect(isIndexableFile("foo.class.inc")).toBe(true);
} finally {
EXTENSION_LANGUAGE_MAP.delete(".inc");
if (hadPrevious) EXTENSION_LANGUAGE_MAP.set(".inc", previous as string);
else EXTENSION_LANGUAGE_MAP.delete(".inc");
}
// Once removed, the extension is no longer indexable.
expect(isIndexableFile("foo.class.inc")).toBe(false);
// Restored to the pre-test baseline.
expect(isIndexableFile("foo.class.inc")).toBe(baseline);
});
it("accepts .cfg and .ini extensions", () => {
@@ -158,6 +163,8 @@ describe("indexer utilities", () => {
it("resolves a mapped extension to the target language's grammar", () => {
const override = new Map([[".inc", ".php"]]);
expect(getAstGrepLang(".inc", override)).toBe("php");
// Case-insensitive, matching getLanguageFromExtension.
expect(getAstGrepLang(".INC", override)).toBe("php");
// The vocabulary subtlety: a Lang-enum language resolves correctly too.
const tsOverride = new Map([[".component", ".ts"]]);
expect(String(getAstGrepLang(".component", tsOverride))).toBe("TypeScript");