Python文本去重 - ⎝⎛CodingNote.cc ⎞⎠

Python文本去重

2020 年 1 月 6 日
筆記

用法：在命令行中运行 python unique.py -f file.txt；输出：去除重复行后的 output.txt（生成在当前目录下）

# -*- coding: utf-8 -*-
# author: cclarence, 2016-04-06
"""Remove duplicate lines from a text file, keeping first-seen order.

Usage:
    python unique.py -f file.txt [-e ENCODING]

The deduplicated lines are written to ``output.txt`` in the current
working directory. The script runs on both Python 2.7 and Python 3.
"""
import io
import sys
from optparse import OptionParser

# NOTE: the former ``reload(sys); sys.setdefaultencoding("utf-8")`` hack was
# dropped on purpose. It only exists on Python 2, and every file access below
# now passes an explicit encoding to ``io.open``, so no implicit
# str/unicode conversion depends on the interpreter-wide default any more.


def readfile(filename, encoding="utf-8"):
    """Return the lines of *filename* without their trailing newlines.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding used to decode the file (default UTF-8).

    Returns:
        A list with one string per line, in file order. Blank lines are
        kept as empty strings.

    Raises:
        SystemExit: With exit status 1 if the file cannot be opened.
    """
    try:
        source = io.open(filename, encoding=encoding)
    except IOError as err:
        # Report the real reason (missing file, permissions, ...) and signal
        # failure to the shell with a non-zero status.
        sys.stderr.write("Cannot open file: %s\n" % err)
        sys.exit(1)
    with source:
        # Strip the terminator from *every* line, including the last one, so
        # that a final line ending in "\n" compares equal to earlier copies.
        return [line.rstrip("\n") for line in source]


def unique(arr):
    """Return the items of *arr* with duplicates removed.

    The first occurrence of each item is kept and the original relative
    order is preserved. Items must be hashable. *arr* is not modified.
    """
    seen = set()
    result = []
    for item in arr:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result


def main():
    """Parse the command line, deduplicate the input file, print a summary."""
    parser = OptionParser()
    parser.add_option("-f", "--file", dest="filename",
                      help="text file whose duplicate lines are removed",
                      metavar="FILE")
    parser.add_option("-e", "--encoding", dest="encoding", default="utf-8",
                      help="encoding of the input and output files "
                           "(default: utf-8)",
                      metavar="ENCODING")
    (options, args) = parser.parse_args()
    if not options.filename:
        # Prints the usage text and exits with status 2.
        parser.error("an input file is required (-f FILE)")

    text = readfile(options.filename, options.encoding)
    text_dealed = unique(text)

    # Every output line, including the last, is terminated with a newline.
    with io.open("output.txt", "w", encoding=options.encoding) as out:
        out.writelines(line + "\n" for line in text_dealed)

    deduplication_num = len(text) - len(text_dealed)
    print("success")
    print("The num of data from the source file        :" + str(len(text)))
    print("The num of data from the preprocessed file: :" + str(len(text_dealed)))
    print("The num of data removed                     :" + str(deduplication_num))


if __name__ == '__main__':
    main()

Previous post

从零开始学会用Python3做捕鱼达人游

Next post

python 读取文件乱码问题