今天写了一个脚本,需要将多个文件中的内容汇总到一个 txt 文件中。由于这些文件使用了三种不同的编码方式(UTF-8、GB2312、UTF-8-SIG),读写时出现了错误,现将解决方法记录如下:
- 今天写了一个脚本,需要将多个文件中的内容汇总到一个 txt 文件中。由于这些文件使用了三种不同的编码方式(UTF-8、GB2312、UTF-8-SIG),读写时出现了错误,现将解决方法记录如下:
- # -*- coding: utf-8 -*-
- import wave
- import pylab as pl
- import numpy as np
- import pandas as pd
- import os
- import time
- import datetime
- import arrow
- import chardet
- import sys
# Python 2 only: force the interpreter-wide default encoding to UTF-8 so that
# implicit str <-> unicode conversions do not fail on non-ASCII subtitle text.
# `reload` is not a builtin on Python 3 (and the hack is unnecessary there),
# so the two calls are guarded instead of raising NameError at import time.
# NOTE(review): sys.setdefaultencoding is presumably only reachable after
# reload(sys) on Python 2 -- confirm before reordering these two lines.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- Python 2 builtin
    sys.setdefaultencoding('utf8')

# Work from the subtitle folder so relative file names resolve against it.
os.chdir("F:/new_srt")
# Collect the subtitle text of every file in the srt folder.
###########################################
def _read_single_column(srt_path, codec):
    """Parse ``srt_path`` with pandas, decoding the file with ``codec``.

    Returns the DataFrame produced by ``pd.read_csv``.  Raises
    ``UnicodeDecodeError`` when the file cannot be decoded with ``codec``.
    """
    # sep='\r' and header=None are kept from the original call; together they
    # are intended to yield one row per text line in a single column.
    # keep_default_na=False stops pandas from turning subtitle lines such as
    # "NA" or "null" into float NaN, which cannot be written out as text.
    return pd.read_csv(srt_path, encoding=codec, sep='\r', header=None,
                       keep_default_na=False)


def get_word(path="F:/new_srt", result_path='F:/result.txt'):
    """Append the subtitle text of every file in ``path`` to ``result_path``.

    For each regular file in ``path`` the encoding is detected with chardet,
    the file is read as one row per line, and the rows are regrouped three at
    a time into the columns ``index`` / ``timecut`` / ``content``.  One line
    is appended to ``result_path`` per input file: the file name without its
    extension followed by every ``content`` value, each followed by a space.
    The output is always written as UTF-8.

    Files whose detected encoding is not UTF-8, GB2312 or UTF-8-SIG are
    reported on stdout and skipped.

    Args:
        path: Directory holding the subtitle files.
        result_path: Text file the collected lines are appended to.

    Raises:
        ValueError: If a file's row count is not a multiple of three
            (raised by the reshape).
        UnicodeDecodeError: If a file cannot be decoded with the codec
            chosen for its detected encoding.
    """
    import io  # local import: encoding-aware open() on both Python 2 and 3

    for name in os.listdir(path):
        # Build the full path so the function does not depend on the current
        # working directory.
        srt_path = os.path.join(path, name)
        if not os.path.isfile(srt_path):
            continue
        print(name)

        # chardet needs raw bytes; the context manager closes the handle.
        with open(srt_path, 'rb') as raw:
            detected = chardet.detect(raw.read())['encoding']
        # The label can be None (e.g. an empty file); compare case-insensitively.
        label = (detected or '').lower()

        if label == 'utf-8':
            data = _read_single_column(srt_path, "utf-8")
        elif label == 'gb2312':
            try:
                data = _read_single_column(srt_path, "gbk")
            except UnicodeDecodeError:
                # Detection was wrong for this file; retry as UTF-8.
                data = _read_single_column(srt_path, "utf-8")
        elif label == 'utf-8-sig':
            data = _read_single_column(srt_path, "UTF-8-SIG")
        else:
            print('this is an error about %s' % name)
            # Skip the file: there is no data for it, and falling through
            # would reuse the previous file's rows (or raise NameError).
            continue

        # Regroup the flat list of lines into (index, timecut, content) rows.
        data_new = pd.DataFrame(np.reshape(data.values, (-1, 3)))
        data_new.columns = ['index', 'timecut', 'content']

        stem = os.path.splitext(name)[0]
        if isinstance(stem, bytes):
            # Python 2: os.listdir() returned a byte string in the OS
            # encoding; decode it so the whole output line is text.
            stem = stem.decode(sys.getfilesystemencoding() or 'utf-8')

        # Same layout as before: "<name> <content> <content> ... \n".
        line = u' '.join([stem] + list(data_new['content'])) + u' \n'
        with io.open(result_path, 'a', encoding='utf-8') as out:
            out.write(line)
# Run the collection only when this file is executed as a script.
if "__main__" == __name__:
    get_word()