training_data.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. #!/usr/bin/env python3
  2. """
  3. AceFlow v2.0 AI训练数据生成器
  4. 为任务分类和流程推荐模型提供训练数据
  5. """
  6. import json
  7. import random
  8. from datetime import datetime, timedelta
  9. from typing import List, Dict, Tuple, Any
  10. from pathlib import Path
  11. from dataclasses import dataclass, asdict
  12. from enum import Enum
  13. # 导入决策引擎的枚举类型
  14. import sys
  15. sys.path.append(str(Path(__file__).parent.parent))
  16. from engines.decision_engine import TaskType, ProjectComplexity, TaskContext, ProjectContext
  17. class TrainingDataGenerator:
  18. """训练数据生成器"""
  19. def __init__(self):
  20. self.task_keywords = {
  21. TaskType.FEATURE_DEVELOPMENT: [
  22. "add new feature", "implement functionality", "create component",
  23. "build module", "develop interface", "add capability", "新功能",
  24. "实现功能", "开发组件", "创建模块", "构建接口"
  25. ],
  26. TaskType.BUG_FIX: [
  27. "fix bug", "resolve issue", "correct error", "patch problem",
  28. "debug issue", "solve crash", "修复bug", "解决问题", "修正错误",
  29. "调试问题", "解决崩溃"
  30. ],
  31. TaskType.REFACTORING: [
  32. "refactor code", "improve structure", "optimize performance",
  33. "clean up code", "restructure", "重构代码", "优化结构",
  34. "性能优化", "代码整理", "重新架构"
  35. ],
  36. TaskType.TESTING: [
  37. "write tests", "add unit tests", "create test cases",
  38. "test functionality", "write integration tests", "编写测试",
  39. "单元测试", "集成测试", "测试用例", "功能测试"
  40. ],
  41. TaskType.DOCUMENTATION: [
  42. "update documentation", "write docs", "create guide",
  43. "document API", "write README", "更新文档", "编写文档",
  44. "创建指南", "API文档", "说明文档"
  45. ],
  46. TaskType.RESEARCH: [
  47. "research technology", "investigate solution", "explore options",
  48. "study framework", "analyze tools", "技术调研", "方案研究",
  49. "技术探索", "框架分析", "工具研究"
  50. ],
  51. TaskType.ARCHITECTURE: [
  52. "design system", "plan architecture", "create blueprint",
  53. "define structure", "design patterns", "系统设计", "架构设计",
  54. "架构规划", "设计模式", "系统架构"
  55. ],
  56. TaskType.DEPLOYMENT: [
  57. "deploy application", "release version", "publish build",
  58. "setup environment", "configure deployment", "部署应用",
  59. "发布版本", "环境配置", "部署配置", "上线部署"
  60. ]
  61. }
  62. self.project_types = ["web", "mobile", "api", "desktop", "game", "ai", "blockchain"]
  63. self.tech_stacks = [
  64. ["Python", "Django", "PostgreSQL"],
  65. ["JavaScript", "React", "Node.js"],
  66. ["Java", "Spring", "MySQL"],
  67. ["C#", ".NET", "SQL Server"],
  68. ["Go", "Gin", "Redis"],
  69. ["TypeScript", "Vue.js", "MongoDB"],
  70. ["Swift", "iOS", "CoreData"],
  71. ["Kotlin", "Android", "SQLite"]
  72. ]
  73. self.priorities = ["high", "medium", "low"]
  74. self.complexities = ["high", "medium", "low"]
  75. self.impacts = ["high", "medium", "low"]
  76. def generate_task_description(self, task_type: TaskType) -> str:
  77. """生成任务描述"""
  78. keywords = self.task_keywords[task_type]
  79. base_keyword = random.choice(keywords)
  80. # 添加上下文信息
  81. contexts = [
  82. "for user dashboard",
  83. "in payment module",
  84. "for mobile app",
  85. "in admin panel",
  86. "for API endpoint",
  87. "in authentication system",
  88. "for data visualization",
  89. "在用户管理模块",
  90. "在支付系统中",
  91. "在移动端应用",
  92. "在后台管理"
  93. ]
  94. context = random.choice(contexts)
  95. # 生成完整描述
  96. if random.random() < 0.5:
  97. return f"{base_keyword} {context}"
  98. else:
  99. return f"{base_keyword} to improve {context}"
  100. def generate_task_context(self, task_type: TaskType) -> TaskContext:
  101. """生成任务上下文"""
  102. description = self.generate_task_description(task_type)
  103. priority = random.choice(self.priorities)
  104. complexity = random.choice(self.complexities)
  105. impact = random.choice(self.impacts)
  106. # 生成依赖关系
  107. dependencies = []
  108. if random.random() < 0.3: # 30%概率有依赖
  109. dep_count = random.randint(1, 3)
  110. for i in range(dep_count):
  111. dependencies.append(f"dependency_{i+1}")
  112. # 生成估算时间
  113. effort_options = ["1-2 hours", "half day", "1-2 days", "3-5 days", "1 week", "2 weeks"]
  114. effort = random.choice(effort_options)
  115. return TaskContext(
  116. description=description,
  117. priority=priority,
  118. estimated_effort=effort,
  119. dependencies=dependencies,
  120. technical_complexity=complexity,
  121. user_impact=impact
  122. )
  123. def generate_project_context(self, preferred_flow: str = None) -> ProjectContext:
  124. """生成项目上下文"""
  125. project_type = random.choice(self.project_types)
  126. tech_stack = random.choice(self.tech_stacks)
  127. # 根据流程类型调整项目特征
  128. if preferred_flow == "minimal":
  129. team_size = random.randint(1, 3)
  130. duration = random.choice(["1 week", "2 weeks", "1 month"])
  131. risk_count = random.randint(0, 2)
  132. elif preferred_flow == "complete":
  133. team_size = random.randint(8, 15)
  134. duration = random.choice(["3 months", "6 months", "1 year"])
  135. risk_count = random.randint(3, 6)
  136. else: # standard
  137. team_size = random.randint(3, 8)
  138. duration = random.choice(["1 month", "2 months", "3 months"])
  139. risk_count = random.randint(1, 3)
  140. # 生成风险因素
  141. risk_factors = []
  142. risk_options = [
  143. "tight deadline", "new technology", "complex requirements",
  144. "limited resources", "high complexity", "uncertain scope",
  145. "integration challenges", "performance requirements"
  146. ]
  147. for _ in range(risk_count):
  148. risk_factors.append(random.choice(risk_options))
  149. # 生成其他属性
  150. stages = ["planning", "development", "testing", "deployment"]
  151. current_stage = random.choice(stages)
  152. completion = random.uniform(0.1, 0.9)
  153. velocity = random.uniform(0.6, 1.2)
  154. return ProjectContext(
  155. name=f"{project_type}-project-{random.randint(1000, 9999)}",
  156. team_size=team_size,
  157. duration_estimate=duration,
  158. technology_stack=tech_stack,
  159. project_type=project_type,
  160. current_stage=current_stage,
  161. completion_percentage=completion,
  162. historical_velocity=velocity,
  163. risk_factors=risk_factors
  164. )
  165. def generate_task_classification_data(self, samples_per_type: int = 50) -> List[Tuple[str, TaskType]]:
  166. """生成任务分类训练数据"""
  167. training_data = []
  168. for task_type in TaskType:
  169. for _ in range(samples_per_type):
  170. task_context = self.generate_task_context(task_type)
  171. training_data.append((task_context.description, task_type))
  172. # 打乱数据顺序
  173. random.shuffle(training_data)
  174. return training_data
  175. def generate_flow_recommendation_data(self, samples_per_flow: int = 100) -> List[Tuple[Dict[str, Any], str]]:
  176. """生成流程推荐训练数据"""
  177. training_data = []
  178. flows = ["minimal", "standard", "complete"]
  179. for flow in flows:
  180. for _ in range(samples_per_flow):
  181. # 生成适合该流程的项目上下文
  182. project_context = self.generate_project_context(flow)
  183. features = project_context.to_features()
  184. training_data.append((features, flow))
  185. # 打乱数据顺序
  186. random.shuffle(training_data)
  187. return training_data
  188. def generate_progress_prediction_data(self, samples: int = 500) -> List[Tuple[Dict[str, Any], int]]:
  189. """生成进度预测训练数据"""
  190. training_data = []
  191. for _ in range(samples):
  192. task_type = random.choice(list(TaskType))
  193. task_context = self.generate_task_context(task_type)
  194. project_context = self.generate_project_context()
  195. # 基于规则生成"真实"持续时间
  196. base_hours = {
  197. TaskType.BUG_FIX: 4,
  198. TaskType.FEATURE_DEVELOPMENT: 16,
  199. TaskType.REFACTORING: 8,
  200. TaskType.TESTING: 6,
  201. TaskType.DOCUMENTATION: 4,
  202. TaskType.RESEARCH: 12,
  203. TaskType.ARCHITECTURE: 24,
  204. TaskType.DEPLOYMENT: 8
  205. }
  206. hours = base_hours.get(task_type, 8)
  207. # 添加随机变化和项目因素影响
  208. if task_context.technical_complexity == "high":
  209. hours *= random.uniform(1.3, 1.8)
  210. elif task_context.technical_complexity == "low":
  211. hours *= random.uniform(0.7, 0.9)
  212. if project_context.team_size > 5:
  213. hours *= random.uniform(1.1, 1.4)
  214. elif project_context.team_size == 1:
  215. hours *= random.uniform(0.8, 1.0)
  216. if len(project_context.risk_factors) > 2:
  217. hours *= random.uniform(1.2, 1.6)
  218. # 添加随机噪声
  219. hours *= random.uniform(0.8, 1.3)
  220. # 合并特征
  221. features = {
  222. **task_context.to_features(),
  223. **project_context.to_features(),
  224. 'task_type_feature': task_type.value
  225. }
  226. training_data.append((features, int(hours)))
  227. return training_data
  228. def save_training_data(self, output_dir: Path):
  229. """保存训练数据到文件"""
  230. output_dir.mkdir(parents=True, exist_ok=True)
  231. # 生成任务分类数据
  232. print("🔄 生成任务分类训练数据...")
  233. task_data = self.generate_task_classification_data(samples_per_type=100)
  234. # 转换为可序列化格式
  235. task_data_serializable = [
  236. (description, task_type.value) for description, task_type in task_data
  237. ]
  238. with open(output_dir / "task_classification_data.json", 'w', encoding='utf-8') as f:
  239. json.dump(task_data_serializable, f, indent=2, ensure_ascii=False)
  240. print(f"✅ 任务分类数据已保存: {len(task_data)} 条记录")
  241. # 生成流程推荐数据
  242. print("🔄 生成流程推荐训练数据...")
  243. flow_data = self.generate_flow_recommendation_data(samples_per_flow=150)
  244. with open(output_dir / "flow_recommendation_data.json", 'w', encoding='utf-8') as f:
  245. json.dump(flow_data, f, indent=2, ensure_ascii=False)
  246. print(f"✅ 流程推荐数据已保存: {len(flow_data)} 条记录")
  247. # 生成进度预测数据
  248. print("🔄 生成进度预测训练数据...")
  249. progress_data = self.generate_progress_prediction_data(samples=800)
  250. with open(output_dir / "progress_prediction_data.json", 'w', encoding='utf-8') as f:
  251. json.dump(progress_data, f, indent=2, ensure_ascii=False)
  252. print(f"✅ 进度预测数据已保存: {len(progress_data)} 条记录")
  253. # 生成数据集统计信息
  254. stats = {
  255. "generation_time": datetime.now().isoformat(),
  256. "total_samples": len(task_data) + len(flow_data) + len(progress_data),
  257. "task_classification_samples": len(task_data),
  258. "flow_recommendation_samples": len(flow_data),
  259. "progress_prediction_samples": len(progress_data),
  260. "task_types": [t.value for t in TaskType],
  261. "flow_types": ["minimal", "standard", "complete"],
  262. "feature_dimensions": {
  263. "task_features": len(self.generate_task_context(TaskType.FEATURE_DEVELOPMENT).to_features()),
  264. "project_features": len(self.generate_project_context().to_features())
  265. }
  266. }
  267. with open(output_dir / "dataset_stats.json", 'w', encoding='utf-8') as f:
  268. json.dump(stats, f, indent=2, ensure_ascii=False)
  269. print(f"📊 数据集统计信息已保存")
  270. print(f"📁 所有训练数据保存在: {output_dir}")
  271. class RealWorldDataCollector:
  272. """真实项目数据收集器"""
  273. def __init__(self, aceflow_dir: Path):
  274. self.aceflow_dir = aceflow_dir
  275. self.data_dir = aceflow_dir / "ai" / "data"
  276. self.real_data_file = self.data_dir / "real_world_data.jsonl"
  277. def collect_project_data(self) -> Dict[str, Any]:
  278. """收集当前项目的真实数据"""
  279. try:
  280. # 读取项目状态
  281. state_file = self.aceflow_dir / "state" / "project_state.json"
  282. if state_file.exists():
  283. with open(state_file, 'r', encoding='utf-8') as f:
  284. state = json.load(f)
  285. else:
  286. state = {}
  287. # 读取配置
  288. config_file = self.aceflow_dir / "config.yaml"
  289. if config_file.exists():
  290. import yaml
  291. with open(config_file, 'r', encoding='utf-8') as f:
  292. config = yaml.safe_load(f)
  293. else:
  294. config = {}
  295. # 收集项目特征
  296. project_data = {
  297. "timestamp": datetime.now().isoformat(),
  298. "project_id": state.get("project_id", "unknown"),
  299. "flow_mode": state.get("flow_mode", "unknown"),
  300. "current_stage": state.get("current_stage", "unknown"),
  301. "progress": state.get("progress", {}),
  302. "team_size": config.get("project", {}).get("team_size", "unknown"),
  303. "project_type": config.get("project", {}).get("project_type", "unknown"),
  304. "agile_framework": config.get("agile", {}).get("framework", "unknown"),
  305. "stage_history": state.get("stage_states", {})
  306. }
  307. return project_data
  308. except Exception as e:
  309. print(f"⚠️ 收集项目数据时出错: {e}")
  310. return {}
  311. def save_real_data(self, data: Dict[str, Any]):
  312. """保存真实数据到文件"""
  313. try:
  314. self.data_dir.mkdir(parents=True, exist_ok=True)
  315. with open(self.real_data_file, 'a', encoding='utf-8') as f:
  316. f.write(json.dumps(data, ensure_ascii=False) + '\n')
  317. print(f"✅ 真实数据已保存: {self.real_data_file}")
  318. except Exception as e:
  319. print(f"❌ 保存真实数据时出错: {e}")
  320. def get_collected_data(self) -> List[Dict[str, Any]]:
  321. """获取所有收集的真实数据"""
  322. data = []
  323. if self.real_data_file.exists():
  324. try:
  325. with open(self.real_data_file, 'r', encoding='utf-8') as f:
  326. for line in f:
  327. if line.strip():
  328. data.append(json.loads(line.strip()))
  329. except Exception as e:
  330. print(f"⚠️ 读取真实数据时出错: {e}")
  331. return data
  332. def main():
  333. """主函数"""
  334. print("🤖 AceFlow v2.0 AI训练数据生成器")
  335. print("=" * 50)
  336. # 设置随机种子以确保可重现性
  337. random.seed(42)
  338. # 确定输出目录
  339. current_dir = Path(__file__).parent
  340. output_dir = current_dir / "training_datasets"
  341. # 生成训练数据
  342. generator = TrainingDataGenerator()
  343. generator.save_training_data(output_dir)
  344. # 收集真实项目数据
  345. aceflow_dir = current_dir.parent.parent
  346. collector = RealWorldDataCollector(aceflow_dir)
  347. real_data = collector.collect_project_data()
  348. if real_data:
  349. collector.save_real_data(real_data)
  350. print("\n🎉 训练数据生成完成!")
  351. print("📋 数据集包含:")
  352. print(" - 任务分类数据: 800条记录")
  353. print(" - 流程推荐数据: 450条记录")
  354. print(" - 进度预测数据: 800条记录")
  355. print(" - 真实项目数据: 持续收集中")
  356. print(f"\n💾 数据保存位置: {output_dir}")
  357. if __name__ == "__main__":
  358. main()