-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Expand file tree
/
Copy pathextract.ts
More file actions
570 lines (495 loc) · 16 KB
/
extract.ts
File metadata and controls
570 lines (495 loc) · 16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
/**
* Extracts code blocks from markdown documentation files.
* Outputs individual files for validation by language-specific tools.
*/
import * as fs from "fs";
import * as path from "path";
import { glob } from "glob";
/** Absolute path of the documentation root that is searched for markdown files. */
const DOCS_DIR = path.resolve(import.meta.dirname, "../../docs");
/**
 * Absolute path of the directory that receives the extracted snippets and the
 * manifest. It is deleted and recreated at the start of every run.
 */
const OUTPUT_DIR = path.resolve(import.meta.dirname, "../../docs/.validation");
/**
 * Maps the lower-cased language tag of a markdown code fence to the canonical
 * language name used by the extractor (output sub-directory, file extension,
 * wrapping rules). Fences whose tag is not listed here are not extracted.
 */
const LANGUAGE_MAP: Record<string, string> = {
typescript: "typescript",
ts: "typescript",
javascript: "typescript", // Treat JS as TS for validation
js: "typescript",
python: "python",
py: "python",
go: "go",
golang: "go",
csharp: "csharp",
"c#": "csharp",
cs: "csharp",
java: "java",
};
/** One fenced code block collected from a markdown file. */
interface CodeBlock {
// Canonical language name (a value of LANGUAGE_MAP)
language: string;
// Lines between the opening and closing fence, joined with "\n"
code: string;
// Path of the markdown file as passed to the parser
file: string;
// 1-indexed line number of the opening fence
line: number;
// True when a skip directive (or the end of a hidden region) preceded the block
skip: boolean;
// True when the block closed inside a hidden region
hidden: boolean;
// True when a wrap-async directive preceded the block
wrapAsync: boolean;
}
/** Shape of the manifest.json file written next to the extracted snippets. */
interface ExtractionManifest {
// ISO-8601 timestamp taken when the extraction run started building the manifest
extractedAt: string;
// One entry per snippet file written
blocks: {
// "<language>/<file name>"; currently the same value as outputFile
id: string;
// Markdown file the snippet came from
sourceFile: string;
// 1-indexed line of the snippet's opening fence in sourceFile
sourceLine: number;
// Canonical language name
language: string;
// Snippet path relative to the output directory
outputFile: string;
}[];
}
/**
 * Scans markdown text line by line and collects every fenced code block whose
 * language tag is a key of LANGUAGE_MAP.
 *
 * A line containing one of the following directives is consumed entirely:
 * - `<!-- docs-validate: skip -->` marks the next collected block as skipped.
 * - `<!-- docs-validate: wrap-async -->` sets `wrapAsync` on the next collected block.
 * - `<!-- docs-validate: hidden -->` opens a hidden region; blocks closed inside
 *   it are flagged `hidden`.
 * - `<!-- /docs-validate: hidden -->` closes the hidden region and marks the
 *   next collected block as skipped.
 *
 * @param content  Raw markdown text; LF and CRLF line endings are both accepted.
 * @param filePath Path stored in the `file` field of every returned block.
 * @returns The collected blocks in document order. A fence that is still open
 *          at the end of the input is not emitted.
 */
function parseMarkdownCodeBlocks(
  content: string,
  filePath: string
): CodeBlock[] {
  const blocks: CodeBlock[] = [];
  // Split on LF or CRLF so files with Windows line endings do not leave a
  // trailing "\r" on every captured code line.
  const lines = content.split(/\r?\n/);

  let inCodeBlock = false;
  let currentLang = "";
  let currentCode: string[] = [];
  let blockStartLine = 0;
  let skipNext = false;
  let wrapAsync = false;
  let inHiddenBlock = false;

  for (const [index, line] of lines.entries()) {
    // Validation directives
    if (line.includes("<!-- docs-validate: skip -->")) {
      skipNext = true;
      continue;
    }
    if (line.includes("<!-- docs-validate: wrap-async -->")) {
      wrapAsync = true;
      continue;
    }
    if (line.includes("<!-- docs-validate: hidden -->")) {
      inHiddenBlock = true;
      continue;
    }
    if (line.includes("<!-- /docs-validate: hidden -->")) {
      inHiddenBlock = false;
      // Skip the next visible code block since the hidden one replaces it
      skipNext = true;
      continue;
    }

    const isFence = line.startsWith("```");

    // Opening fence
    if (!inCodeBlock && isFence) {
      const tag = line.slice(3).trim().toLowerCase();
      // Own-property lookup: a tag such as "constructor" must not resolve to a
      // member inherited from Object.prototype.
      const canonical = Object.prototype.hasOwnProperty.call(LANGUAGE_MAP, tag)
        ? LANGUAGE_MAP[tag]
        : undefined;
      if (canonical) {
        inCodeBlock = true;
        currentLang = canonical;
        currentCode = [];
        blockStartLine = index + 1; // 1-indexed line number of the fence
      }
      // Fences with an unsupported (or no) tag are ignored, including their closing fence
      continue;
    }

    // Closing fence
    if (inCodeBlock && isFence) {
      blocks.push({
        language: currentLang,
        code: currentCode.join("\n"),
        file: filePath,
        line: blockStartLine,
        skip: skipNext,
        hidden: inHiddenBlock,
        wrapAsync: wrapAsync,
      });
      inCodeBlock = false;
      currentLang = "";
      currentCode = [];
      // Only reset skipNext when NOT in a hidden region: hidden regions can
      // contain several code fences that all share the same state.
      if (!inHiddenBlock) {
        skipNext = false;
      }
      wrapAsync = false;
      continue;
    }

    // Body of a collected block
    if (inCodeBlock) {
      currentCode.push(line);
    }
  }

  return blocks;
}
/**
 * Builds the output file name for an extracted block in the form
 * `<markdown basename>_<n><extension>`, where `n` is the number of blocks of
 * the same language that were named before this one.
 *
 * @param block      Block being named; its `file` and `language` are used.
 * @param index      Not read by the implementation; kept so existing callers still match.
 * @param langCounts Per-language running counters; the entry for the block's
 *                   language is incremented by this call.
 * @returns The file name (no directory part).
 */
function generateFileName(
  block: CodeBlock,
  index: number,
  langCounts: Map<string, number>
): string {
  const { language, file } = block;

  const ordinal = langCounts.get(language) || 0;
  langCounts.set(language, ordinal + 1);

  const stem = path.basename(file, ".md");
  const suffix = getExtension(language);
  return `${stem}_${ordinal}${suffix}`;
}
/**
 * Returns the file extension (including the leading dot) used for snippets of
 * the given canonical language; any other value yields ".txt".
 */
function getExtension(language: string): string {
  // A Map (rather than a plain object) keeps the lookup free of inherited keys
  const extensions = new Map<string, string>([
    ["typescript", ".ts"],
    ["python", ".py"],
    ["go", ".go"],
    ["csharp", ".cs"],
    ["java", ".java"],
  ]);
  const known = extensions.get(language);
  return known === undefined ? ".txt" : known;
}
/**
 * Tells whether a block is a partial snippet that cannot be validated as a
 * standalone file (configuration excerpts, bare signatures, lone annotations).
 * The checks are heuristics applied to the trimmed code and depend on the
 * block's language; languages without a rule are never skipped here.
 */
function shouldSkipFragment(block: CodeBlock): boolean {
  const snippet = block.code.trim();

  switch (block.language) {
    case "typescript": {
      // Leading `property: {` / `property: [` => excerpt of an object literal
      const startsWithProperty = /^[a-zA-Z_]+\s*:\s*[\{\[]/.test(snippet);
      if (startsWithProperty) {
        return true;
      }
      // Whole snippet is one `{ ... }` with no module syntax => unassigned literal
      const isBraceWrapped = /^\{[\s\S]*\}$/.test(snippet);
      const usesModuleSyntax =
        snippet.includes("import ") || snippet.includes("export ");
      return isBraceWrapped && !usesModuleSyntax;
    }

    case "go":
      // A lone function signature with a parenthesised result list and no body
      return /^func\s+\w+\([^)]*\)\s*\([^)]*\)\s*$/.test(snippet);

    case "java": {
      // Annotation with no braces anywhere in the snippet
      const annotationOnly = /^@\w+/.test(snippet) && !snippet.includes("{");
      // Method declaration terminated by `;` instead of a body
      const bodilessMethod =
        /^(public|private|protected)?\s*(static\s+)?[\w<>\[\]]+\s+\w+\([^)]*\)\s*(throws\s+[\w,\s]+)?;\s*$/.test(
          snippet
        );
      return annotationOnly || bodilessMethod;
    }

    default:
      return false;
  }
}
/**
 * Python: wraps the snippet in `async def main()` + `asyncio.run(main())` when
 * wrapping is forced or when `await` appears outside an `async def` body.
 */
function wrapPythonSnippet(source: string, forceAsync: boolean): string {
  const hasAwait = /\bawait\b/.test(source);
  const hasAsyncDef = /\basync\s+def\b/.test(source);

  // Heuristic scan: track whether the current line sits inside an `async def`
  // by comparing its indentation with that of the `async def` line.
  let awaitOutsideFunction = false;
  let inAsyncFunction = false;
  let indentLevel = 0;
  for (const line of source.split("\n")) {
    const trimmed = line.trimStart();
    const leadingSpaces = line.length - trimmed.length;

    if (trimmed.startsWith("async def ")) {
      inAsyncFunction = true;
      indentLevel = leadingSpaces;
    } else if (
      inAsyncFunction &&
      leadingSpaces <= indentLevel &&
      trimmed !== "" &&
      !trimmed.startsWith("#")
    ) {
      // Dedented back to (or past) the `async def` column: the function ended
      inAsyncFunction = false;
    }

    if (trimmed.includes("await ") && !inAsyncFunction) {
      awaitOutsideFunction = true;
      break;
    }
  }

  const needsWrap =
    forceAsync || awaitOutsideFunction || (hasAwait && !hasAsyncDef);
  if (!needsWrap) {
    return source;
  }

  const indented = source
    .split("\n")
    .map((l) => " " + l)
    .join("\n");
  return `import asyncio\n\nasync def main():\n${indented}\n\nasyncio.run(main())`;
}

/**
 * Go: prepends `package main` when no package clause is present and appends an
 * empty `func main() {}` when the snippet ends in a declaration line and holds
 * only declarations.
 */
function wrapGoSnippet(source: string): string {
  let code = source;
  if (!code.includes("package ")) {
    code = `package main\n\n${code}`;
  }
  if (code.includes("func main()")) {
    return code;
  }

  // Only snippets whose last line starts with a lower-case letter are considered
  const lastLine = code.trim().split("\n").pop() || "";
  if (!/^[a-z]/.test(lastLine)) {
    return code;
  }

  // A "loose" statement is a non-blank line that is neither part of the
  // package/import preamble (incl. tab-indented lines and closing parens) nor
  // a top-level type/func/var/const declaration or comment.
  const hasLooseStatements = code.split("\n").some(
    (l) =>
      !l.startsWith("package ") &&
      !l.startsWith("import ") &&
      !l.startsWith(")") &&
      !l.startsWith("\t") &&
      l.trim() !== "" &&
      !l.startsWith("type ") &&
      !l.startsWith("func ") &&
      !l.startsWith("//") &&
      !l.startsWith("var ") &&
      !l.startsWith("const ")
  );

  // Declarations only: just make sure the package has a main function
  return hasLooseStatements ? code : code + "\n\nfunc main() {}";
}

/**
 * C#: snippets without a namespace/class/record/delegate are wrapped in a
 * uniquely named static class with a `Main` method (async when the snippet
 * awaits) to avoid several files with top-level statements in one project.
 * SDK usings are added unless the snippet already has a GitHub.Copilot using.
 */
function wrapCSharpSnippet(source: string, file: string, line: number): string {
  const hasStructure =
    source.includes("namespace ") ||
    source.includes("class ") ||
    source.includes("record ") ||
    source.includes("public delegate ");

  if (hasStructure) {
    // If the snippet declares its own GitHub.Copilot using, assume the author
    // curated the list (avoids ambiguities such as ModelCapabilities living
    // in both namespaces).
    return source.includes("using GitHub.Copilot")
      ? source
      : "using GitHub.Copilot;\nusing GitHub.Copilot.Rpc;\n" + source;
  }

  // Hoist existing using directives above the generated class
  const usings: string[] = [];
  const rest: string[] = [];
  for (const current of source.split("\n")) {
    const trimmed = current.trim();
    if (trimmed.startsWith("using ") && trimmed.endsWith(";")) {
      usings.push(current);
    } else {
      rest.push(current);
    }
  }

  const hasAnyCopilotUsing = usings.some(
    (u) => u.includes("GitHub.Copilot;") || u.includes("GitHub.Copilot.")
  );
  if (!hasAnyCopilotUsing) {
    usings.push("using GitHub.Copilot;");
    usings.push("using GitHub.Copilot.Rpc;");
  }

  // Class name derived from the block location so every wrapper is unique
  const className = `ValidationClass_${file.replace(/[^a-zA-Z0-9]/g, "_")}_${line}`;
  const mainSignature = source.includes("await ")
    ? "public static async Task Main()"
    : "public static void Main()";
  const indentedCode = rest.map((l) => " " + l).join("\n");

  return [
    usings.join("\n"),
    `public static class ${className}`,
    "{",
    mainSignature,
    "{",
    indentedCode,
    "}",
    "}",
  ].join("\n");
}

/**
 * Java: snippets without a class/interface/enum are wrapped in a uniquely
 * named public class with a `main` method; default imports are added unless a
 * com.github.copilot import is already present.
 */
function wrapJavaSnippet(source: string, file: string, line: number): string {
  const hasClass =
    source.includes("class ") ||
    source.includes("interface ") ||
    source.includes("enum ");

  if (hasClass) {
    return source.includes("import com.github.copilot")
      ? source
      : "import com.github.copilot.*;\nimport com.github.copilot.rpc.*;\nimport java.util.*;\nimport java.util.concurrent.*;\n" +
          source;
  }

  // Hoist existing import statements above the generated class
  const imports: string[] = [];
  const rest: string[] = [];
  for (const current of source.split("\n")) {
    if (current.trim().startsWith("import ")) {
      imports.push(current);
    } else {
      rest.push(current);
    }
  }

  const hasAnyCopilotImport = imports.some((i) =>
    i.includes("com.github.copilot")
  );
  if (!hasAnyCopilotImport) {
    imports.push("import com.github.copilot.*;");
    imports.push("import com.github.copilot.rpc.*;");
    imports.push("import java.util.*;");
    imports.push("import java.util.concurrent.*;");
  }

  // Class name derived from the block location; Java identifiers cannot start with a digit
  let className = `${file.replace(/[^a-zA-Z0-9]/g, "_")}_${line}`;
  if (/^\d/.test(className)) {
    className = "Snippet_" + className;
  }
  const indentedCode = rest.map((l) => " " + l).join("\n");

  return [
    imports.join("\n"),
    `public class ${className} {`,
    "public static void main(String[] args) throws Exception {",
    indentedCode,
    "}",
    "}",
  ].join("\n");
}

/**
 * Returns the text that is written to disk for a block: the snippet itself,
 * plus whatever language-specific scaffolding is needed so that it can be
 * checked as a standalone file. TypeScript (and any other language without a
 * rule) is returned unchanged.
 *
 * @param block Extracted block; `language`, `code`, `file`, `line` and
 *              `wrapAsync` are read.
 * @returns The possibly wrapped source code.
 */
function wrapCodeForValidation(block: CodeBlock): string {
  switch (block.language) {
    case "python":
      return wrapPythonSnippet(block.code, block.wrapAsync);
    case "go":
      return wrapGoSnippet(block.code);
    case "csharp":
      return wrapCSharpSnippet(block.code, block.file, block.line);
    case "java":
      return wrapJavaSnippet(block.code, block.file, block.line);
    default:
      return block.code;
  }
}
/**
 * Runs the extraction: recreates the output directory, parses every markdown
 * file under DOCS_DIR, writes one source file per extractable code block
 * (prefixed with a comment naming its origin), writes manifest.json and prints
 * a summary.
 *
 * Blocks flagged `skip` and blocks recognised as fragments are counted as
 * skipped; blocks whose trimmed code is shorter than 10 characters are dropped
 * without being counted.
 */
async function main() {
  console.log("📖 Extracting code blocks from documentation...\n");

  // Start from an empty output tree with one sub-directory per language
  if (fs.existsSync(OUTPUT_DIR)) {
    fs.rmSync(OUTPUT_DIR, { recursive: true });
  }
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  ["typescript", "python", "go", "csharp", "java"].forEach((lang) => {
    fs.mkdirSync(path.join(OUTPUT_DIR, lang), { recursive: true });
  });

  // Collect the markdown sources
  const markdownFiles = await glob("**/*.md", {
    cwd: DOCS_DIR,
    ignore: [".validation/**", "node_modules/**", "IMPROVEMENT_PLAN.md"],
  });
  console.log(`Found ${markdownFiles.length} markdown files\n`);

  const manifest: ExtractionManifest = {
    extractedAt: new Date().toISOString(),
    blocks: [],
  };
  const langCounts = new Map<string, number>();
  let written = 0;
  let skipped = 0;
  let hidden = 0;

  for (const relativePath of markdownFiles) {
    const markdown = fs.readFileSync(path.join(DOCS_DIR, relativePath), "utf-8");

    for (const block of parseMarkdownCodeBlocks(markdown, relativePath)) {
      if (block.skip) {
        skipped += 1;
        continue;
      }
      if (block.hidden) {
        hidden += 1;
      }
      // Empty or trivial blocks are dropped silently
      if (block.code.trim().length < 10) {
        continue;
      }
      // Fragments cannot be validated standalone
      if (shouldSkipFragment(block)) {
        skipped += 1;
        continue;
      }

      const generatedName = generateFileName(block, written, langCounts);
      const wrappedCode = wrapCodeForValidation(block);

      // Java requires the file name to match the public class name
      const javaClass =
        block.language === "java"
          ? wrappedCode.match(/public class (\w+)/)
          : null;
      const outputName = javaClass ? javaClass[1] + ".java" : generatedName;

      // First line of every snippet names the markdown location it came from
      const header = getSourceComment(block.language, block.file, block.line);
      fs.writeFileSync(
        path.join(OUTPUT_DIR, block.language, outputName),
        header + "\n" + wrappedCode
      );

      const relativeOutput = `${block.language}/${outputName}`;
      manifest.blocks.push({
        id: relativeOutput,
        sourceFile: block.file,
        sourceLine: block.line,
        language: block.language,
        outputFile: relativeOutput,
      });
      written += 1;
    }
  }

  // Persist the manifest next to the snippets
  fs.writeFileSync(
    path.join(OUTPUT_DIR, "manifest.json"),
    JSON.stringify(manifest, null, 2)
  );

  // Summary table
  console.log("Extraction complete!\n");
  console.log(" Language Count");
  console.log(" ─────────────────────");
  langCounts.forEach((count, lang) => {
    console.log(` ${lang.padEnd(14)} ${count}`);
  });
  console.log(" ─────────────────────");
  console.log(` Total ${written}`);
  if (skipped > 0) {
    console.log(` Skipped ${skipped}`);
  }
  if (hidden > 0) {
    console.log(` Hidden ${hidden}`);
  }
  console.log(`\nOutput: ${OUTPUT_DIR}`);
}
/**
 * Builds the single-line comment placed at the top of an extracted snippet,
 * e.g. `// Source: guides/setup.md:42`.
 *
 * @param language Canonical language name; "python" gets a `#` comment, every
 *                 other value a `//` comment.
 * @param file     Path of the originating markdown file.
 * @param line     Line number of the block within that file.
 */
function getSourceComment(
  language: string,
  file: string,
  line: number
): string {
  // Use forward slashes only: a backslash path such as "docs\usage.md" would
  // put "\u" into the comment, which Java reads as a unicode escape.
  const portablePath = file.split("\\").join("/");
  const marker = language === "python" ? "#" : "//";
  return `${marker} Source: ${portablePath}:${line}`;
}
// Script entry point: run the extraction; on failure log the error and exit
// with status 1.
main().catch((failure) => {
  console.error("Extraction failed:", failure);
  process.exit(1);
});