


python split_multimol2.py multi-mol2.mol2 out_dir

注释:命令各部分依次为 python 解释器、脚本文件(split_multimol2.py)、输入的多分子 mol2 库文件(multi-mol2.mol2)、输出目录(out_dir)。


  1. #Python2 or Python3
  2. #AspirinCode 2018
  3. #Script that splits a multi-mol2 file into individual mol2 files.
  4. #python split_multimol2.py multi-mol2.mol2 out_dir
  5. import sys
  6. import os
  7. def split_multimol2(multimol2):
  8. """
  9. Splits a multi-mol2 file.
  10. Parameters
  11. ----------
  12. multimol2 : str
  13. Path to the multi-mol2 file.
  14. Returns
  15. ----------
  16. A generator object for lists for every extracted mol2-file. Lists contain
  17. the molecule ID and the mol2 file contents.
  18. e.g., ['ID1234', '@<TRIPOS>MOLECULE...'
  19. """
  20. with open(multimol2, 'r') as mol2file:
  21. line = mol2file.readline()
  22. while not mol2file.tell() == os.fstat(mol2file.fileno()).st_size:
  23. if line.startswith("@<TRIPOS>MOLECULE"):
  24. mol2cont = []
  25. mol2cont.append(line)
  26. line = mol2file.readline()
  27. molecule_id = line.strip()
  28. while not line.startswith("@<TRIPOS>MOLECULE"):
  29. mol2cont.append(line)
  30. line = mol2file.readline()
  31. if mol2file.tell() == os.fstat(mol2file.fileno()).st_size:
  32. mol2cont.append(line)
  33. break
  34. mol2cont[-1] = mol2cont[-1].rstrip() # removes blank line at file end
  35. yield [molecule_id, "".join(mol2cont)]
  36. def write_multimol2(multimol2, out_dir):
  37. """
  38. Splits a multi-mol2 file into smaller multi-mol2 files.
  39. Parameters
  40. -----------
  41. multimol2 : str
  42. Path to the multi-mol2 file.
  43. out_dir : str:
  44. Output directory. New files will be named
  45. <molecule_name_1>.mol2, ... <molecule_name_n>.mol2
  46. Returns
  47. -----------
  48. chunks : int
  49. Number of files written.
  50. """
  51. if not out_dir:
  52. os.mkdir(out_dir)
  53. single_mol2s = split_multimol2(args.MOL2_FILE)
  54. for mol2 in single_mol2s:
  55. out_mol2 = os.path.join(args.OUT_DIR, mol2[0]) + '.mol2'
  56. with open(out_mol2, 'w') as out_file:
  57. for line in mol2[1]:
  58. out_file.write(line)
  59. out_file.write('\n')
  60. def write_multimol2_chunks(multimol2, chunk_size, out_dir):
  61. """
  62. Splits a multi-mol2 file into smaller multi-mol2 files.
  63. Parameters
  64. -----------
  65. multimol2 : str
  66. Path to the multi-mol2 file.
  67. chunksize : int
  68. Number of mol2 files per chunk.
  69. out_dir : str:
  70. Output directory. New files will be named
  71. <multimol2>_1.mol2, ... <multimol2>_n.mol2
  72. Returns
  73. -----------
  74. chunks : int
  75. Number of files written.
  76. """
  77. if not os.path.exists(out_dir):
  78. os.mkdir(out_dir)
  79. out_path_stem = os.path.dirname(multimol2)
  80. out_file_stem = os.path.basename(multimol2).split('.mol2')[0]
  81. cnt = 0
  82. chunks = 1
  83. out_file = open(os.path.join(out_dir, out_file_stem)+'_%d.mol2' % chunks, 'w')
  84. for mol2 in split_multimol2(multimol2):
  85. cnt += 1
  86. if cnt == chunk_size:
  87. cnt = 0
  88. chunks += 1
  89. out_file.close()
  90. out_file = open(os.path.join(out_dir, out_file_stem)+'_%d.mol2' % chunks, 'w')
  91. out_file.write(mol2[1] + '\n')
  92. out_file.close()
  93. return chunks
  94. if __name__ == '__main__':
  95. import argparse
  96. parser = argparse.ArgumentParser(
  97. description='Splits a multi-mol2 file into individual mol2 files',
  98. formatter_class=argparse.RawTextHelpFormatter
  99. )
  100. parser.add_argument('MOL2_FILE')
  101. parser.add_argument('OUT_DIR')
  102. parser.add_argument('-c', '--chunksize', help='Number of MOL2 structures per file (1 by default)', type=int)
  103. parser.add_argument('-v', '--version', action='version', version='split_multimol2 v. 1.1')
  104. args = parser.parse_args()
  105. if args.chunksize:
  106. write_multimol2_chunks(multimol2=args.MOL2_FILE, chunk_size=args.chunksize, out_dir=args.OUT_DIR)
  107. else:
  108. write_multimol2(multimol2=args.MOL2_FILE, out_dir=args.OUT_DIR)