#!/usr/bin/env python3
# -*- coding: utf-8 -*-
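"""
Database source-code merge script for software copyright applications
(Python version).

Purpose: concatenate every database-related file into a single document
intended for software copyright registration material.

Features:
- Supports SQL files, table-creation statements, stored procedures, etc.
- Recognizes database file types automatically
- Keeps the formatting of the SQL code intact
- Cross-platform (Windows/Linux/macOS)
"""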
import json
import os
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict
# Colored console output helpers
  19. class Colors:
  20. RED = '\033[0;31m'
  21. GREEN = '\033[0;32m'
  22. YELLOW = '\033[1;33m'
  23. BLUE = '\033[0;34m'
  24. NC = '\033[0m' # No Color
  25. def print_success(message: str):
  26. print(f"{Colors.GREEN}✓ {message}{Colors.NC}")
  27. def print_info(message: str):
  28. print(f"{Colors.BLUE}ℹ {message}{Colors.NC}")
  29. def print_warning(message: str):
  30. print(f"{Colors.YELLOW}⚠ {message}{Colors.NC}")
  31. def print_error(message: str):
  32. print(f"{Colors.RED}✗ {message}{Colors.NC}")
  33. def get_project_config() -> Optional[dict]:
  34. """读取项目配置文件"""
  35. config_file = Path("ai-copyright-config.json")
  36. if not config_file.exists():
  37. print_error("配置文件不存在: ai-copyright-config.json")
  38. return None
  39. try:
  40. with open(config_file, 'r', encoding='utf-8') as f:
  41. return json.load(f)
  42. except json.JSONDecodeError as e:
  43. print_error(f"配置文件JSON格式错误: {e}")
  44. return None
  45. except Exception as e:
  46. print_error(f"读取配置文件失败: {e}")
  47. return None
  48. def collect_database_files(db_dir: Path) -> List[Path]:
  49. """收集所有数据库相关文件"""
  50. if not db_dir.exists():
  51. return []
  52. # 数据库文件扩展名
  53. db_extensions = {
  54. '.sql', '.ddl', '.dml', '.plsql', '.psql',
  55. '.mysql', '.pgsql', '.sqlite', '.db',
  56. '.mdb', '.accdb', '.dbf'
  57. }
  58. db_files = []
  59. # 递归搜索数据库文件
  60. for file_path in db_dir.rglob('*'):
  61. if file_path.is_file():
  62. # 检查文件扩展名或包含SQL关键词的文件
  63. if (file_path.suffix.lower() in db_extensions or
  64. 'sql' in file_path.name.lower() or
  65. 'database' in file_path.name.lower() or
  66. 'schema' in file_path.name.lower()):
  67. if not should_exclude_file(file_path):
  68. db_files.append(file_path)
  69. # 按相对路径排序,确保一致的输出顺序
  70. db_files.sort(key=lambda x: str(x.relative_to(db_dir)).lower())
  71. return db_files
  72. def should_exclude_file(file_path: Path) -> bool:
  73. """判断是否应该排除某个文件"""
  74. exclude_patterns = [
  75. '.git',
  76. '.svn',
  77. '.log',
  78. '.tmp',
  79. '.temp',
  80. '.bak',
  81. '.backup'
  82. ]
  83. file_str = str(file_path).lower()
  84. for pattern in exclude_patterns:
  85. if pattern in file_str:
  86. return True
  87. # 排除空文件或过大的文件
  88. try:
  89. file_size = file_path.stat().st_size
  90. if file_size == 0 or file_size > 50 * 1024 * 1024: # 50MB限制
  91. return True
  92. except:
  93. return True
  94. return False
  95. def read_database_file(file_path: Path) -> str:
  96. """安全读取数据库文件内容"""
  97. try:
  98. with open(file_path, 'r', encoding='utf-8') as f:
  99. return f.read()
  100. except UnicodeDecodeError:
  101. # 尝试其他编码
  102. encodings = ['gb2312', 'gbk', 'iso-8859-1', 'latin-1']
  103. for encoding in encodings:
  104. try:
  105. with open(file_path, 'r', encoding=encoding) as f:
  106. return f.read()
  107. except:
  108. continue
  109. print_warning(f"无法读取文件 {file_path.name}: 编码问题")
  110. return f"-- 文件读取失败: {file_path.name} (编码问题)"
  111. except Exception as e:
  112. print_warning(f"无法读取文件 {file_path.name}: {e}")
  113. return f"-- 文件读取失败: {file_path.name}"
  114. def analyze_sql_content(content: str) -> Dict[str, int]:
  115. """分析SQL内容,统计各种语句类型"""
  116. content_upper = content.upper()
  117. stats = {
  118. 'CREATE TABLE': content_upper.count('CREATE TABLE'),
  119. 'CREATE VIEW': content_upper.count('CREATE VIEW'),
  120. 'CREATE INDEX': content_upper.count('CREATE INDEX'),
  121. 'CREATE PROCEDURE': content_upper.count('CREATE PROCEDURE'),
  122. 'CREATE FUNCTION': content_upper.count('CREATE FUNCTION'),
  123. 'INSERT INTO': content_upper.count('INSERT INTO'),
  124. 'UPDATE': content_upper.count('UPDATE'),
  125. 'DELETE FROM': content_upper.count('DELETE FROM'),
  126. 'SELECT': content_upper.count('SELECT'),
  127. 'ALTER TABLE': content_upper.count('ALTER TABLE'),
  128. 'DROP TABLE': content_upper.count('DROP TABLE'),
  129. }
  130. return {k: v for k, v in stats.items() if v > 0}
  131. def generate_header(config: dict, file_count: int) -> str:
  132. """生成文档头部信息"""
  133. current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  134. header = f"""
  135. {'-' * 80}
  136. 软件著作权申请材料 - 数据库源代码文档
  137. {'-' * 80}
  138. 软件名称: {config.get('title', '未设置')}
  139. 软件简称: {config.get('short_title', config.get('title', '未设置'))}
  140. 后端技术: {config.get('backend', '未设置')}
  141. 生成模式: {config.get('generation_mode', '未设置')}
  142. 文档生成信息:
  143. - 生成时间: {current_time}
  144. - 数据库文件数量: {file_count}
  145. - 文档类型: 数据库设计和建表语句
  146. - 编码格式: UTF-8
  147. {'-' * 80}
  148. """
  149. return header
  150. def generate_footer() -> str:
  151. """生成文档尾部信息"""
  152. footer = f"""
  153. {'-' * 80}
  154. 文档结束
  155. 生成工具: AI驱动的软件著作权申请材料生成系统 (Python版本)
  156. 生成时间: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
  157. {'-' * 80}
  158. """
  159. return footer
  160. def merge_database_files():
  161. """主要的数据库文件合并逻辑"""
  162. print_info("🔄 开始拼接数据库源代码...")
  163. # 1. 确定项目根目录
  164. script_dir = Path(__file__).parent.parent.parent
  165. db_dir = script_dir / "output_sourcecode" / "db"
  166. output_dir = script_dir / "output_docs"
  167. output_file = output_dir / "数据库源代码.txt"
  168. print_info(f"项目根目录: {script_dir}")
  169. print_info(f"数据库目录: {db_dir}")
  170. print_info(f"输出文件: {output_file}")
  171. # 2. 检查数据库目录
  172. if not db_dir.exists():
  173. print_error(f"数据库目录不存在: {db_dir}")
  174. print_info("💡 请先生成数据库源代码文件")
  175. return False
  176. # 3. 确保输出目录存在
  177. output_dir.mkdir(parents=True, exist_ok=True)
  178. # 4. 读取项目配置
  179. config = get_project_config()
  180. if not config:
  181. print_warning("无法读取项目配置,使用默认配置")
  182. config = {
  183. 'title': '软件系统',
  184. 'short_title': '软件系统',
  185. 'backend': 'Java',
  186. 'generation_mode': 'fast'
  187. }
  188. # 5. 收集数据库文件
  189. db_files = collect_database_files(db_dir)
  190. if not db_files:
  191. print_error(f"在 {db_dir} 中未发现数据库文件")
  192. print_info("💡 请先生成数据库源代码文件")
  193. return False
  194. print_success(f"发现 {len(db_files)} 个数据库文件")
  195. # 6. 按文件类型分组统计
  196. file_stats = {}
  197. for file_path in db_files:
  198. ext = file_path.suffix.lower() or '(无扩展名)'
  199. if ext not in file_stats:
  200. file_stats[ext] = 0
  201. file_stats[ext] += 1
  202. print_info("文件类型统计:")
  203. for ext, count in sorted(file_stats.items()):
  204. print_info(f" {ext}: {count} 个文件")
  205. # 7. 开始合并文件
  206. total_sql_stats = {}
  207. try:
  208. with open(output_file, 'w', encoding='utf-8') as output:
  209. # 写入文档头部
  210. output.write(generate_header(config, len(db_files)))
  211. # 逐个处理数据库文件
  212. for i, db_file in enumerate(db_files, 1):
  213. rel_path = db_file.relative_to(db_dir)
  214. print_info(f"处理文件 {i}/{len(db_files)}: {rel_path}")
  215. # 读取文件内容
  216. content = read_database_file(db_file)
  217. # 分析SQL内容
  218. sql_stats = analyze_sql_content(content)
  219. for stmt_type, count in sql_stats.items():
  220. if stmt_type not in total_sql_stats:
  221. total_sql_stats[stmt_type] = 0
  222. total_sql_stats[stmt_type] += count
  223. # 添加文件分隔标识
  224. separator = f"""
  225. {'=' * 80}
  226. 文件 {i}: {db_file.name}
  227. 文件路径: output_sourcecode/db/{rel_path}
  228. 文件类型: {db_file.suffix or '(无扩展名)'}
  229. 文件大小: {db_file.stat().st_size} 字节
  230. {'=' * 80}
  231. """
  232. output.write(separator)
  233. # 如果有SQL统计信息,添加到分隔符中
  234. if sql_stats:
  235. output.write("SQL语句统计:\n")
  236. for stmt_type, count in sql_stats.items():
  237. output.write(f" {stmt_type}: {count}\n")
  238. output.write("\n")
  239. # 写入文件内容
  240. output.write(content)
  241. # 确保文件内容以换行结束
  242. if content and not content.endswith('\n'):
  243. output.write('\n')
  244. # 添加文件结束标识
  245. output.write(f"\n\n{'=' * 80}\n文件 {i} 结束: {db_file.name}\n{'=' * 80}\n\n")
  246. # 写入总体SQL统计
  247. if total_sql_stats:
  248. output.write(f"""
  249. {'-' * 80}
  250. 整体SQL语句统计
  251. {'-' * 80}
  252. """)
  253. for stmt_type, count in sorted(total_sql_stats.items()):
  254. output.write(f"{stmt_type}: {count}\n")
  255. output.write(f"\n{'-' * 80}\n")
  256. # 写入文档尾部
  257. output.write(generate_footer())
  258. # 8. 输出统计信息
  259. file_size = output_file.stat().st_size
  260. file_size_mb = file_size / (1024 * 1024)
  261. print_success("✅ 数据库源代码拼接完成")
  262. print_info(f"📄 输出文件: {output_file}")
  263. print_info(f"📊 文件统计:")
  264. print_info(f" - 数据库文件数量: {len(db_files)}")
  265. print_info(f" - 总文件大小: {file_size:,} 字节 ({file_size_mb:.2f} MB)")
  266. if total_sql_stats:
  267. print_info("📊 SQL语句统计:")
  268. for stmt_type, count in sorted(total_sql_stats.items()):
  269. print_info(f" - {stmt_type}: {count}")
  270. # 9. 生成详细报告
  271. with open(output_dir / "数据库拼接报告.txt", 'w', encoding='utf-8') as report:
  272. report.write(f"数据库源代码拼接报告\n")
  273. report.write(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
  274. report.write(f"文件类型统计:\n")
  275. for ext, count in sorted(file_stats.items()):
  276. report.write(f" {ext}: {count} 个文件\n")
  277. if total_sql_stats:
  278. report.write(f"\nSQL语句统计:\n")
  279. for stmt_type, count in sorted(total_sql_stats.items()):
  280. report.write(f" {stmt_type}: {count}\n")
  281. report.write(f"\n文件列表:\n")
  282. for i, db_file in enumerate(db_files, 1):
  283. rel_path = db_file.relative_to(db_dir)
  284. file_size = db_file.stat().st_size
  285. report.write(f"{i:3d}. {rel_path} ({file_size:,} 字节)\n")
  286. total_size = sum(f.stat().st_size for f in db_files)
  287. report.write(f"\n总计: {len(db_files)} 个文件,{total_size:,} 字节\n")
  288. print_success("📋 生成详细报告: 数据库拼接报告.txt")
  289. return True
  290. except Exception as e:
  291. print_error(f"文件合并过程中发生错误: {e}")
  292. return False
  293. def main():
  294. """主函数"""
  295. if len(sys.argv) > 1 and sys.argv[1] in ['-h', '--help']:
  296. print("数据库源代码拼接脚本 (Python版本)")
  297. print("\n用法:")
  298. print(" python3 merge_database_simple.py")
  299. print("\n说明:")
  300. print(" 将 output_sourcecode/db/ 目录下的所有数据库文件")
  301. print(" 拼接成单一的源代码文档用于软著申请")
  302. print("\n支持的文件类型:")
  303. print(" - .sql, .ddl, .dml - SQL脚本文件")
  304. print(" - .plsql, .psql - 存储过程文件")
  305. print(" - .mysql, .pgsql - 数据库特定文件")
  306. print(" - database_schema.sql - 建表语句")
  307. print("\n输出:")
  308. print(" output_docs/数据库源代码.txt")
  309. print(" output_docs/数据库拼接报告.txt")
  310. return
  311. success = merge_database_files()
  312. sys.exit(0 if success else 1)
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()