由隐马尔科夫意淫无字典中文分词 C#

using System;
using System.Collections;
using System.Collections.Generic;
using System.ComponentModel;
using System.IO;
using System.Text.RegularExpressions;
using System.Windows.Forms;

namespace HMM
{
    /// <summary>
    /// Main window of a dictionary-free Chinese word segmenter.
    /// Button 1 preprocesses the corpus (every "*.txt" file under "data") and builds or
    /// loads a per-character frequency table persisted in "dict.txt"; button 2 segments
    /// the text typed into <c>textBox1</c> by comparing how often a candidate word occurs
    /// in the corpus with how often its individual characters occur.
    /// </summary>
    public partial class Form1 : Form
    {
        private const string CorpusDirectory = "data";
        private const string DictPath = "dict.txt";

        // Label text that marks the "corpus not preprocessed yet" state; button 2 checks for it.
        private const string NotPreparedText = "先预处理!";

        // Threshold used when textBox3 does not contain a valid number.
        private const double DefaultSingleCutRate = 0.01;

        // Compiled once instead of being rebuilt on every call.
        private static readonly Regex NonChinese = new Regex(@"[^\u4e00-\u9fa5]", RegexOptions.Compiled);
        private static readonly Regex ChineseOnly = new Regex(@"^[\u4e00-\u9fa5]+$", RegexOptions.Compiled);

        // Corpus texts, one slot per file in fis; stripped to U+4E00..U+9FA5 after preprocessing.
        private string[] arrayData;
        private DirectoryInfo di;
        private FileInfo[] fis;

        // Character -> number of occurrences in the corpus. Values are always int, whether
        // the table was built from the corpus or loaded from dict.txt.
        private readonly Dictionary<char, int> htDict = new Dictionary<char, int>();

        private double singleCutRate;

        // True while a background worker is running. Only touched on the UI thread.
        private bool isBusy;

        /// <summary>
        /// Initializes the designer controls and enumerates the corpus files.
        /// A missing "data" directory is treated as an empty corpus instead of throwing.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
            label1.Text = NotPreparedText;
            progressBar1.Visible = false;
            di = new DirectoryInfo(CorpusDirectory);
            fis = di.Exists ? di.GetFiles("*.txt") : new FileInfo[0];
            arrayData = new string[fis.Length];
        }

        /// <summary>Pins the window size to 800x600 whenever it is resized.</summary>
        private void Form1_Resize(object sender, EventArgs e)
        {
            this.Width = 800;
            this.Height = 600;
        }

        /// <summary>
        /// Preprocesses the corpus on a background thread. When "dict.txt" does not exist
        /// the character table is built from the corpus and written to it; otherwise the
        /// table is loaded from the file.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            if (isBusy)
            {
                return;
            }

            bool buildDictionary = !File.Exists(DictPath);

            RunInBackground(
                worker => Preprocess(worker, buildDictionary),
                result => { label1.Text = "预处理完成!|" + (int)result; },
                // The corpus/table may be half loaded after a failure, so require a new run.
                () => { label1.Text = NotPreparedText; });
        }

        /// <summary>
        /// Segments the Chinese characters of <c>textBox1</c> on a background thread and
        /// writes the words, each followed by "|", into <c>textBox2</c>.
        /// Does nothing until preprocessing has completed or when the input is blank.
        /// </summary>
        private void button2_Click(object sender, EventArgs e)
        {
            if (isBusy || label1.Text == NotPreparedText || textBox1.Text.Trim() == "")
            {
                return;
            }

            textBox2.Text = "";
            if (!double.TryParse(textBox3.Text.Trim(), out singleCutRate))
            {
                singleCutRate = DefaultSingleCutRate;
            }

            // Snapshot everything the worker needs so it never reads UI controls.
            double cutRate = singleCutRate;
            string strSplitWords = NonChinese.Replace(textBox1.Text.Trim(), "");

            RunInBackground(
                worker => Segment(strSplitWords, cutRate, worker.ReportProgress),
                result =>
                {
                    // Build the output once; appending to TextBox.Text per word is quadratic.
                    System.Text.StringBuilder output = new System.Text.StringBuilder();
                    foreach (string word in (List<string>)result)
                    {
                        output.Append(word).Append("|");
                    }
                    textBox2.Text = output.ToString();
                    label2.Text = "分词完成!";
                },
                null);
        }

        /// <summary>Returns true when <paramref name="str"/> is non-empty and consists only of characters in U+4E00..U+9FA5.</summary>
        public bool IsChinese(string str)
        {
            return str != null && ChineseOnly.IsMatch(str);
        }

        /// <summary>
        /// Counts the occurrences (overlapping ones included) of <paramref name="s"/> in all
        /// strings of <paramref name="d"/>, using ordinal comparison.
        /// </summary>
        /// <returns>The total number of matches; 0 when <paramref name="s"/> is null or empty or <paramref name="d"/> is null.</returns>
        public int ReturnCount(string s, string[] d)
        {
            // An empty needle matches at every index and would push the start index past the
            // end of the text (ArgumentOutOfRangeException), so it is defined as "no match".
            if (string.IsNullOrEmpty(s) || d == null)
            {
                return 0;
            }

            int count = 0;
            foreach (string text in d)
            {
                if (text == null)
                {
                    continue; // slot not loaded
                }

                int pos = text.IndexOf(s, 0, StringComparison.Ordinal);
                while (pos != -1)
                {
                    count++;
                    // Advance by one character only, so overlapping matches are counted.
                    pos = text.IndexOf(s, pos + 1, StringComparison.Ordinal);
                }
            }
            return count;
        }

        /// <summary>
        /// Sums the corpus frequencies of every character of <paramref name="s"/>.
        /// Characters that are not in the table contribute 0.
        /// </summary>
        public int ReturnTotalCount(string s)
        {
            if (s == null)
            {
                return 0;
            }

            int total = 0;
            foreach (char c in s)
            {
                int frequency;
                if (htDict.TryGetValue(c, out frequency))
                {
                    total += frequency;
                }
            }
            return total;
        }

        /// <summary>
        /// Runs <paramref name="work"/> on a BackgroundWorker while showing the progress bar.
        /// The completion callbacks run on the UI thread, so no Invoke is needed in them.
        /// </summary>
        /// <param name="work">Background job; its return value is handed to <paramref name="onSuccess"/>.</param>
        /// <param name="onSuccess">Called with the job's result when it finished without an exception.</param>
        /// <param name="onFailure">Optional; called after the error has been shown to the user.</param>
        private void RunInBackground(Func<BackgroundWorker, object> work, Action<object> onSuccess, Action onFailure)
        {
            isBusy = true;
            progressBar1.Visible = true;

            BackgroundWorker worker = new BackgroundWorker();
            worker.WorkerReportsProgress = true; // report progress
            worker.DoWork += (s, o) => { o.Result = work(worker); };
            worker.ProgressChanged += (s, o) =>
            {
                progressBar1.Style = ProgressBarStyle.Continuous;
                // Clamp so a rounding slip can never throw from the Value setter.
                progressBar1.Value = Math.Max(progressBar1.Minimum, Math.Min(progressBar1.Maximum, o.ProgressPercentage));
            };
            worker.RunWorkerCompleted += (s, o) =>
            {
                progressBar1.Visible = false;
                progressBar1.Value = 0;
                isBusy = false;
                worker.Dispose();

                // An exception thrown by DoWork is delivered here instead of being raised.
                if (o.Error != null)
                {
                    MessageBox.Show(this, o.Error.Message, this.Text, MessageBoxButtons.OK, MessageBoxIcon.Error);
                    if (onFailure != null)
                    {
                        onFailure();
                    }
                    return;
                }
                onSuccess(o.Result);
            };
            worker.RunWorkerAsync();
        }

        /// <summary>
        /// Loads and cleans the corpus, then builds+saves or loads the character table.
        /// Progress: 0-33 reading files, 33-66 cleaning/counting, 67-100 dictionary I/O.
        /// </summary>
        /// <returns>The number of dictionary entries written or read.</returns>
        private int Preprocess(BackgroundWorker worker, bool buildDictionary)
        {
            // NOTE(review): Encoding.Default is kept from the original; it presumably matches
            // the ANSI code page the corpus files were saved in - confirm.
            for (int i = 0; i < fis.Length; i++)
            {
                arrayData[i] = File.ReadAllText(fis[i].FullName, System.Text.Encoding.Default);
                worker.ReportProgress((int)((double)(i + 1) / (double)fis.Length * 33));
            }

            // Start from a clean table so a repeated run cannot accumulate stale counts.
            htDict.Clear();

            for (int i = 0; i < arrayData.Length; i++)
            {
                arrayData[i] = NonChinese.Replace(arrayData[i], "");
                if (buildDictionary)
                {
                    CountCharacters(arrayData[i]);
                }
                worker.ReportProgress((int)((double)i / (double)arrayData.Length * 33) + 33);
            }

            return buildDictionary ? SaveDictionary(worker) : LoadDictionary(worker);
        }

        /// <summary>Adds every character of <paramref name="text"/> (already stripped to Chinese characters) to the frequency table.</summary>
        private void CountCharacters(string text)
        {
            foreach (char c in text)
            {
                int frequency;
                htDict.TryGetValue(c, out frequency); // leaves 0 when the key is absent
                htDict[c] = frequency + 1;
            }
        }

        /// <summary>Writes the table to dict.txt, one "character|count" line per entry, and returns the number of lines written.</summary>
        private int SaveDictionary(BackgroundWorker worker)
        {
            int written = 0;
            using (StreamWriter writer = new StreamWriter(DictPath, false, System.Text.Encoding.Default))
            {
                foreach (KeyValuePair<char, int> entry in htDict)
                {
                    writer.WriteLine(entry.Key.ToString() + "|" + entry.Value.ToString(System.Globalization.CultureInfo.InvariantCulture));
                    written++;
                    worker.ReportProgress((int)((double)written / (double)htDict.Count * 33) + 67);
                }
            }
            return written;
        }

        /// <summary>
        /// Reads "character|count" lines from dict.txt into the table and returns how many
        /// were accepted. Malformed lines (too short, no '|', non-numeric count) are skipped.
        /// </summary>
        private int LoadDictionary(BackgroundWorker worker)
        {
            int loaded = 0;
            using (StreamReader reader = new StreamReader(DictPath, System.Text.Encoding.Default))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    int frequency;
                    if (line.Length < 3 || line[1] != '|' ||
                        !int.TryParse(line.Substring(2), System.Globalization.NumberStyles.Integer,
                                      System.Globalization.CultureInfo.InvariantCulture, out frequency))
                    {
                        continue;
                    }
                    htDict[line[0]] = frequency;
                    loaded++;
                }
            }
            worker.ReportProgress(100);
            return loaded;
        }

        /// <summary>
        /// Ratio between the corpus occurrences of <paramref name="candidate"/> and the summed
        /// frequencies of its characters; the divisor is 1 when none of them is in the table.
        /// </summary>
        private double CohesionRatio(string candidate)
        {
            double occurrences = ReturnCount(candidate, arrayData);
            double characterTotal = ReturnTotalCount(candidate);
            if (characterTotal == 0)
            {
                characterTotal = 1;
            }
            return occurrences / characterTotal;
        }

        /// <summary>
        /// Greedily splits <paramref name="text"/> into words: the current candidate keeps
        /// growing by one character until its ratio says the shorter form is the better word.
        /// </summary>
        /// <param name="text">Input already stripped to Chinese characters.</param>
        /// <param name="cutRate">Upper bound on the extended candidate's ratio for cutting after a candidate whose ratio is exactly 1.</param>
        /// <param name="reportProgress">Receives a percentage in 0..100.</param>
        private List<string> Segment(string text, double cutRate, Action<int> reportProgress)
        {
            List<string> words = new List<string>();

            if (text.Length < 2)
            {
                // A lone character is a word by itself (the original dropped it).
                if (text.Length == 1)
                {
                    words.Add(text);
                }
                reportProgress(100);
                return words;
            }

            int start = 0;
            string current = text.Substring(0, 1);
            double currentRatio = CohesionRatio(current);

            while (true)
            {
                // Invariant: start + current.Length < text.Length, so the extension fits.
                string extended = text.Substring(start, current.Length + 1);
                double extendedRatio = CohesionRatio(extended);

                bool cutHere = (currentRatio < 1 && currentRatio > extendedRatio)
                    || (currentRatio == 1 && extendedRatio < cutRate)
                    || (currentRatio == 0 && extendedRatio == 0);

                if (cutHere)
                {
                    words.Add(current);
                    start += current.Length;
                    reportProgress((int)((double)start / (double)text.Length * 100));

                    int remaining = text.Length - start;
                    if (remaining <= 1)
                    {
                        if (remaining == 1)
                        {
                            words.Add(text.Substring(start, 1));
                        }
                        break;
                    }

                    current = text.Substring(start, 1);
                    currentRatio = CohesionRatio(current);
                }
                else
                {
                    // Reuse the ratio just computed; each one costs a full corpus scan.
                    current = extended;
                    currentRatio = extendedRatio;
                    if (start + current.Length >= text.Length)
                    {
                        words.Add(current);
                        break;
                    }
                }
            }

            reportProgress(100);
            return words;
        }
    }
}
与其在那里苦苦挣扎,碍于面子硬撑,倒不如微笑着面对,

由隐马尔科夫意淫无字典中文分词 C#

相关文章:

你感兴趣的文章:

标签云: