多线程批量转换文件编码：从 GBK、GB2312 编码转换到 UTF-8

# coding=utf-8# author:Jeffrey Ma# version:0.1# build 2# created on:2015年3月31日# description: 1. 批量转换文件编码,从GBK GB2312编码转换到UTF-8编码#2. 支持指定目录下所有的文件的转换,包括子目录中的文件#3. 支持检测原始编码,对已经是UTF-8编码的文件,不做转换#4. 支持只转换指定扩展名的编码#5. 支持多线程转换和控制台输出#6. 支持控制台显示线程池的状态#7. 支持日志记录# usage: python gbk2utf8.py -s [文件路径]# args : 文件的绝对路径# notes : 转换前请备份原始文件,,转换后的文件会覆盖原文件。from __future__ import divisionosimport getoptimport loggingimport logging.configimport Queueimport threadpoolimport threadingfrom threading import Threadfrom multiprocessing.dummy import Pool as ThreadPoolimport chardetimport cursesimport timeimport localelocale.setlocale(locale.LC_ALL, "")global loggerglobal stdscrglobal poolstdscr = curses.initscr()def GBK2UTF8(filename):threadName = threading.currentThread().getName()f = open(filename, ‘rb’)s = f.read()f.close()encodingName = chardet.detect(s)[‘encoding’]str = "";if (encodingName.startswith(‘GB’)):# GBK码,需要转换try:gbkContent = s.decode(encodingName)utf8Content = gbkContent.encode(‘utf-8’)f = open(filename, ‘w’)f.write(utf8Content)f.close()except UnicodeDecodeError:str = "%s: %s, %s 转换出错" % (threadName, filename, encodingName)# logger.error("%s: %s, %s 转换出错" % (threadName, filename, encodingName))# logger.error(‘%s: decoe error %s’ % (threadName, UnicodeDecodeError.reason))passstr = "%s: %s, %s 转换done" % (threadName, filename, encodingName)else:# 已经是UTF-8不需要转换str = "%s: %s, %s 已经是UTF-8不需要转换" % (threadName, filename, encodingName)return {"tName": threadName, "encodingName": encodingName,"filename":filename, "result":str}def initLogger():global logger# 日志初始化LOG_FILENAME = ‘logging.conf’logging.config.fileConfig(LOG_FILENAME)logger = logging.getLogger("GBK2UTF8")# 测试代码 # logger.debug("debug message") # logger.info("info message") # logger.warn("warn message") # logger.error("error message") # logger.critical("critical message")def main():initLogger()shortargs = ‘s:d’longargs = [‘src=’, ‘dest’]try:opts, args = getopt.getopt(sys.argv[1:], shortargs, longargs)except getopt.GetoptError, 
err:# print help information and exit:(err) # will print something like "option -a not recognized"# usage()print "Usage: python gbk2utf8.py -s [file full path]"return# sys.exit(2)srcPath = NonedestPath = Nonefor o, a in opts:if o in ("-s", "–src"):srcPath = aelif o in ("-d", "–dest"):destPath = aelse:assert False, "unhandled option"os.path.exists(srcPath) and os.path.isdir(srcPath)):doWork(srcPath)def doWork(sPath):# Make the Pool of workersglobal poolpool = threadpool.ThreadPool(10)extFilters = [‘xml’, ‘java’, ‘js’, ‘txt’, ‘css’, ‘php’, ‘html’, ‘htm’, ‘tpl’]i = 0arrFiles = []for root, dirs, files in os.walk(sPath):for file in files:# print root# print filei = i+1sFilePath = root + os.sep + fileextension = os.path.splitext(sFilePath)[1][1:]if (extension in extFilters):arrFiles.append(sFilePath)else:logger.info(‘Skipping %s’ % sFilePath)print ‘waiting…job’curses.noecho()curses.cbreak()requests = threadpool.makeRequests(GBK2UTF8, arrFiles, print_result)[pool.putRequest(req) for req in requests]#close the pool and wait for the work to finishpool.wait()curses.nocbreak()curses.echo()curses.endwin()print ‘end job’def print_result(request, result):try:idx = 0for t in pool.workers:idx = idx+1if(t.getName() == result["tName"]):breakif idx > 0:y, x = stdscr.getmaxyx()# stdscr.deleteln()text = result["result"]textLen = len(text)text = text.ljust(x)stdscr.addstr(idx, 0, text)stdscr.refresh()logger.info(text)except curses.error:passif __name__ == ‘__main__’:main()

父母养我不容易,我在学校争口气。

多线程批量转换文件编码：从 GBK、GB2312 编码转换到 UTF-8

相关文章:

你感兴趣的文章:

标签云: