Java实现简体中文转繁体中文的工具(包括编码转换和语义转换)

简体中文转繁体中文的工具,包括:1、编码转换(GBK -> Big5);2、语义转换(根据词库;需要词库的请通过 Email 联系我)。

package i18n.converter;import java.io.BufferedReader;import java.io.BufferedWriter;import java.io.ByteArrayInputStream;import java.io.File;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.OutputStreamWriter;import java.util.Enumeration;import java.util.Hashtable;import java.util.Vector;/*** Author: zhusheng3@126.com*/public class Gbk2Big5Converter {protected Hashtable s2thash = new Hashtable();static String[] dictFiles = new String[] { “mappings_gbk2big5_1-1.txt”,”mappings_gbk2big5_phrase.txt” };String dataline;public Gbk2Big5Converter() {s2thash = getHashDict();}/** 根据GBK的词典生成简转繁的对应关系,包括词组和单字*/public Hashtable getHashDict() {Hashtable hashDict = new Hashtable();BufferedReader br = null;for (String filename : dictFiles) {try {InputStream dictStream = getClass().getResourceAsStream(filename);br = new BufferedReader(new InputStreamReader(dictStream, “gbk”));String line = null;while ((line = br.readLine()) != null) {if (line.length() < 3 || line.charAt(0) == '#') {continue;}int idx = line.indexOf(“,”);if (idx > 0) {String src = line.substring(0, idx).trim();String tgt = line.substring(idx + 1).trim();if (hashDict.get(src) != null) {hashDict.remove(src);}hashDict.put(src, tgt);}}} catch (Exception ex) {ex.printStackTrace();} finally {if (br != null) {try {br.close();} catch (IOException e) {}}}}return hashDict;}/** 利用词典对一个字符串进行替换*/public String convertString(String inline) {StringBuffer outline = new StringBuffer(inline);convertStringBuffer(outline);return outline.toString();}/** 利用词典对一个StringBuffer进行替换*/public void convertStringBuffer(StringBuffer dataline) {String lin = dataline.toString();// System.out.println(“before:” + lin);int startPostion = 0;String currchar;char charvalue;for (int beginChar = startPostion; beginChar <= dataline.length(); beginChar++) {String newStr = “”;// System.out.println(“开始位置beginChar:” + beginChar);// 在该位置下的子串最长度int 
maxLengthOfSubstr = dataline.length() – beginChar;// System.out.println(“在该位置下的子串最大长度:” + maxLengthOfSubstr);// 找出所有子串for (int currentLen = maxLengthOfSubstr; currentLen >= 1; currentLen–) {// 英文字符不用匹配,直接跳出if (isSingleByte(dataline.substring(beginChar, beginChar + 1))) {// System.out.println(inputString.substring(beginChar,beginChar+1));break;}// 取得当前子串if (beginChar + currentLen <= dataline.length()) {// 当前子串String subStr = dataline.substring(beginChar, beginChar+ currentLen);// System.out.println(“当前子串:” + subStr);if (s2thash.get(subStr) != null) {// System.out.println(“找到匹配:” + subStr + “->”+// s2thash.get(subStr));newStr = s2thash.get(subStr).toString();dataline.replace(beginChar, beginChar + currentLen,s2thash.get(subStr).toString());String after = dataline.toString();// System.out.println(“本次替换后的字符串:” + after);if (beginChar + newStr.length() < dataline.length()) {// System.out.println(“替换完成后开始字符:”+// dataline.charAt(beginChar+ newStr.length()));} else {// System.out.println(“本字符串没有新字符可以替换了!”);}beginChar = beginChar + newStr.length() – 1;// 找到匹配后,就不用继续往下找本起始字符下的更短的字符串了// System.out.println(“找到匹配后,就不用继续往下找本起始字符下的更短的字符串了”);break;}}}if (beginChar >= dataline.length())break;}// System.out.println(“after:” + dataline.toString());}/** 把目标文件或者文件夹(sourcedir,gbk编码)转成big5编码, 并另存为目标文件夹(targetdir,big5编码)*/public void convertFile(String sourcedir, String targetdir) {int source_encoding = 0;int target_encoding = 4;BufferedReader srcbuffer;BufferedWriter outbuffer;String dataline;Vector inputfiles = new Vector();Vector outputfiles = new Vector();inputfiles.add(sourcedir);outputfiles.add(targetdir);int i, j, working_encoding;File tmpfile, tmpout;String dirfiles[];for (i = 0; i < inputfiles.size(); i++) {tmpfile = new File((String) inputfiles.get(i));if (tmpfile.exists() == false) {System.out.println(“ERROR: Source file “+ (String) inputfiles.get(i) + ” does not exist./n”);continue;}if (tmpfile.isDirectory() == true) {tmpout = new File((String) outputfiles.get(i));if 
(tmpout.exists() == false) {tmpout.mkdir();}dirfiles = tmpfile.list();if (dirfiles != null) {for (j = 0; j < dirfiles.length; j++) {inputfiles.add((String) inputfiles.get(i)+ File.separator + dirfiles[j]);outputfiles.add((String) outputfiles.get(i)+ File.separator + dirfiles[j]);}}continue;}System.out.println(“Converting ” + inputfiles.get(i) + ” to “+ outputfiles.get(i) + ” with encoding ” + source_encoding);try {working_encoding = source_encoding;srcbuffer = new BufferedReader(new InputStreamReader(new FileInputStream((String) inputfiles.get(i)), “gbk”));outbuffer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream((String) outputfiles.get(i)),”big5″));while ((dataline = srcbuffer.readLine()) != null) {outbuffer.write(convertString(dataline));outbuffer.newLine();}srcbuffer.close();outbuffer.close();} catch (Exception ex) {System.err.println(ex);}}}public File convertSimpleString(String inputString) {// System.out.println(“before->inputString:”+inputString);byte[] bytes = inputString.getBytes();StringBuffer sb = new StringBuffer();// write the string to a temp fileFile result = new File(“temp.txt”);try {InputStream inputStream = new ByteArrayInputStream(inputString.getBytes());BufferedReader srcbuffer = new BufferedReader(new InputStreamReader(inputStream, “gbk”));BufferedWriter outbuffer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(result), “big5”));while ((dataline = srcbuffer.readLine()) != null) {outbuffer.write(convertString(dataline));outbuffer.newLine();}srcbuffer.close();outbuffer.close();} catch (Exception e) {e.printStackTrace();}// System.out.println(“after->result:”+result);return result;}/** 判断是否单字节字,中文都不是单字节字*/public static boolean isSingleByte(String inStr) {if (inStr.getBytes().length == inStr.length()) {return true;} else {return false;}}public void printDict() {Gbk2Big5Converter aConverter = new Gbk2Big5Converter();for (int i = 20; i > 0; i–) {Enumeration enums = aConverter.s2thash.keys();while 
(enums.hasMoreElements()) {String ele = (String) enums.nextElement();if (ele.length() == i) {System.out.print(ele);System.out.println(“,” + aConverter.s2thash.get(ele));}}}}public static void main(String[] args) {Gbk2Big5Converter aGbk2Big5Converter = new Gbk2Big5Converter();String src = “src//resource_zh_CN.properties.org”;String tgt = “src//resource_zh_TW.properties.org”;System.out.println(new File(src).getAbsolutePath());aGbk2Big5Converter.convertFile(src, tgt);}}

我知道我不是一个很好的记录者,但我比任何人都喜欢回首自己来时的路……

Java实现简体中文转繁体中文的工具(包括编码转换和语义转换)

相关文章:

你感兴趣的文章:

标签云: