/// <summary>
/// Splits the current input text into words. Every identified word is written to
/// "output.txt" (UTF-8, overwritten) and echoed into TXBout until the box holds
/// about 10000 characters. Exceptions raised inside are caught and their text is
/// placed in TXBout.
/// </summary>
private async Task PerformSplit()
{
    // Upper bound on the number of characters echoed into TXBout.
    const int echoLimit = 10000;

    try
    {
        Trace.Indent();

        var maxWordLength = int.Parse(IFdata.GetField("maxWordLength"));
        // The two ratios are parsed but not used further in this method.
        var probRatio = double.Parse(IFdata.GetField("probRatio"));
        var bemsRatio = double.Parse(IFdata.GetField("bemsRatio"));

        CHBsplit.Enabled = false;

        string fileName = "output.txt";
        var encoding = Encoding.UTF8;
        using (var writer = new StreamWriter(fileName, false, encoding))
        {
            TXBout.Clear();

            // Called once per identified word: persist it, then echo it to the UI
            // through Invoke so the text box is touched on its own thread.
            SentenceSplitter.WordIdentifiedEventHandler onWord = word =>
            {
                writer.WriteLine(word);
                TXBout.Invoke(new Action(() =>
                {
                    if (TXBout.TextLength >= echoLimit)
                    {
                        return;
                    }

                    TXBout.AppendText($"{word}\r\n");
                    if (TXBout.TextLength >= echoLimit)
                    {
                        TXBout.AppendText("......(Cut)\r\n");
                    }
                }));
            };

            var splitter = new SentenceSplitter(trie, baseDataLength);
            try
            {
                splitter.WordIdentified += onWord;
                Trace.WriteLine("Splitting...");

                // Text typed into TXBdata wins; otherwise fall back to the loaded
                // file content, and finally to the data field.
                var mainInputs = string.IsNullOrWhiteSpace(TXBdata.Text)
                    ? (txbDataFileContent ?? data)
                    : TXBdata.Text;
                var tokens = mainInputs.Split(' ', '\r', '\n', '\t');

                long identified = 0;
                foreach (var token in tokens)
                {
                    var pieces = await splitter.SplitAsync(token, maxWordLength, probType, true);
                    identified += pieces.Count;
                }

                Trace.WriteLine($"{identified} words identified.");
            }
            catch (Exception error)
            {
                TXBout.Text = error.ToString();
            }
            finally
            {
                splitter.WordIdentified -= onWord;
            }
            // The using block flushes and closes the writer.
        }
    }
    catch (Exception error)
    {
        TXBout.Text = error.ToString();
    }
    finally
    {
        Trace.Unindent();
        CHBsplit.CheckState = CheckState.Indeterminate;
        CHBsplit.Enabled = true;
    }
}
/// <summary>
/// Builds the "SA" tab: lays out the input box, the scrollable control column and the
/// output/data boxes, wires the event handlers, creates the suffix array and the
/// sentence splitter, and finally calls StartServices().
/// </summary>
public SATabPage() : base("SA")
{
    // Row 0 of the main layout: input box on the left, control column host on the right.
    TLPmain.Controls.Add(TLPtop, 0, 0);
    TLPtop.Controls.Add(TXBin, 0, 0);
    var ctrlHost = new MyPanel() { Controls = { TLPctrl }, AutoScroll = true, Dock = DockStyle.Top };
    TLPtop.Controls.Add(ctrlHost, 1, 0);

    // Control column, filled from top to bottom; `row` tracks the next free row.
    int row = 0;

    TLPctrl.Controls.Add(CBmethod, 0, row++);
    foreach (var methodLabel in new[] { "Count Word", "List Words", "Send Socket", "Cut by Code" })
    {
        CBmethod.Items.Add(methodLabel);
    }

    var plainControls = new Control[]
    {
        BTNexportSA,
        BTNsave,
        BTNload,
        BTNexportList,
        BTNnew,
        CHBdebugMode,
        CHBreplaceWithEmptyExceptChinese,
        CHBremoveEmpty,
        CHBverbose,
    };
    foreach (var control in plainControls)
    {
        TLPctrl.Controls.Add(control, 0, row++);
    }

    TLPctrl.Controls.Add(IFdata, 0, row++);
    IFdata.AddField("maxWordLength", maxWordLength.ToString());

    TLPctrl.Controls.Add(CBprobType, 0, row++);
    foreach (var probTypeLine in SentenceSplitter.probTypeString.Split('\n'))
    {
        CBprobType.Items.Add(probTypeLine);
    }
    CBprobType.SelectedValueChanged += (sender, e) =>
    {
        // Pick the first enum member whose "probType == ProbTypeEnum.X" marker occurs
        // in the selected text; falls back to the enum's default value if none does.
        var candidates = Enum.GetValues(typeof(SentenceSplitter.ProbTypeEnum))
                             .Cast<SentenceSplitter.ProbTypeEnum>();
        probType = candidates.FirstOrDefault(
            v => CBprobType.Text.IndexOf($"probType == ProbTypeEnum.{v}") >= 0);
    };

    TLPctrl.Controls.Add(CHBsplit, 0, row++);

    // Rows 1 and 2 of the main layout.
    TLPmain.Controls.Add(TXBout, 0, 1);
    TLPmain.Controls.Add(TXBdata, 0, 2);

    // Event wiring.
    TXBdata.MouseDoubleClick += TXBdata_MouseDoubleClick;
    TXBin.TextChanged += TXBin_TextChanged;
    TXBin.KeyDown += TXBin_KeyDown;

    var sampleCodeItem = new MenuItem("sample code", (menuSender, menuArgs) => { TXBin.Text = sampleCode; });
    TXBin.ContextMenu = new ContextMenu(new[] { sampleCodeItem });

    BTNexportSA.Click += BTNexportSA_Click;
    BTNexportList.Click += BTNexportList_Click;
    CHBsplit.CheckedChanged += CHBsplit_CheckedChanged;
    BTNsave.Click += BTNsave_Click;
    BTNload.Click += BTNload_Click;
    BTNnew.Click += BTNnew_Click;

    this.Controls.Add(TLPmain);

    // Back-end objects used by the handlers above.
    sa = new SuffixArray();
    ss = new SentenceSplitter(sa);
    StartServices();
}
/// <summary>
/// Compiles <paramref name="dataInput"/> as the body of a generated static method and
/// uses that method to split the current input text with <c>ss_CutByCode</c>.
/// </summary>
/// <param name="dataInput">
/// C# statements that become the body of the generated method (see the signature in
/// the code template below).
/// </param>
/// <returns>
/// The split result, one line per input token with the pieces joined by spaces, cut off
/// once it would exceed about 10000 characters; the exception text if compiling or
/// splitting failed; or <c>null</c> when a newer call arrived while this one was waiting.
/// </returns>
async Task<string> CutByCode(string dataInput)
{
    // Generated method signature: double FooMethod(string S, int N, Func<string,int> C)
    var counter = System.Threading.Interlocked.Increment(ref counter_CutByCode);

    // Acquire BEFORE entering the try: if the wait itself fails, the finally block must
    // not release a semaphore this call never entered.
    await SemaphoreSlim_CutByCode.WaitAsync();
    try
    {
        // A later call bumped the counter while we were queued; let that one do the work.
        if (counter != System.Threading.Interlocked.Read(ref counter_CutByCode))
        {
            return null;
        }

        const string namespaceName = "WikiDataAnalysis", className = "FooClass", methodName = "FooMethod";
        // The newline after the user body keeps a trailing "// comment" in dataInput
        // from swallowing the closing braces that follow it.
        string code =
            "using System;" +
            $"namespace {namespaceName}" +
            "{" +
            $" class {className}" +
            " {" +
            $" public static double {methodName}(string S,int N,Func<string,int> C)" +
            " {" +
            $" {dataInput}\n" +
            " }" +
            " }" +
            "}";

        try
        {
            Trace.Indent();
            Trace.WriteLine($"Compiling... code length = {code.Length}");
            System.Reflection.MethodInfo methodInfo =
                Utils.DynamicCompile.GetMethod(code, namespaceName, className, methodName, "System");
            var method = new Func<string, int, Func<string, int>, double>(
                (text, number, callback) => (double)methodInfo.Invoke(null, new object[] { text, number, callback }));

            Trace.WriteLine("Splitting...");
            var maxWordLength = int.Parse(IFdata.GetField("maxWordLength"));
            // Parsed but not used below; kept so a missing or malformed field still
            // fails exactly as before.
            var probRatio = double.Parse(IFdata.GetField("probRatio"));
            var bemsRatio = double.Parse(IFdata.GetField("bemsRatio"));

            // Read the text box here, in the same context as the IFdata reads above,
            // instead of from inside the worker started by Task.Run.
            var mainInputs = string.IsNullOrWhiteSpace(TXBdata.Text)
                ? (txbDataFileContent ?? data)
                : TXBdata.Text;

            var sb_ret = new StringBuilder();
            long cnt = 0;
            await Task.Run(() =>
            {
                var inputs = mainInputs.Split(' ', '\r', '\n', '\t');
                if (ss_CutByCode == null)
                {
                    ss_CutByCode = new SentenceSplitter(sa);
                }

                const int maxoutputLength = 10000;
                bool appending = true;
                int progress = 0, total_progress = inputs.Length;
                // UtcNow: the value is only used to measure elapsed time, so it should
                // not be affected by local clock/time-zone adjustments.
                var lastUpdateTime = DateTime.MinValue;

                foreach (var input in inputs)
                {
                    ++progress;
                    var now = DateTime.UtcNow;
                    if ((now - lastUpdateTime).TotalSeconds > 0.5)
                    {
                        Trace.WriteLine($"Splitting... {progress}/{total_progress}");
                        lastUpdateTime = now;
                    }

                    var cutResult = ss_CutByCode.Split(input, maxWordLength, method, false);
                    cnt += cutResult.Count;

                    // Once the output would grow past the limit, stop appending for good
                    // but keep splitting so the word count stays complete.
                    if (sb_ret.Length + cutResult.Sum(piece => (long)piece.Length) > maxoutputLength)
                    {
                        appending = false;
                    }
                    if (appending)
                    {
                        sb_ret.AppendLine(string.Join(" ", cutResult));
                    }
                }
            });

            Trace.WriteLine($"{cnt} words identified.");
            return sb_ret.ToString();
        }
        catch (Exception error)
        {
            // Compile and runtime errors are handed back as text for display.
            return error.ToString();
        }
        finally
        {
            Trace.Unindent();
        }
    }
    finally
    {
        // SemaphoreSlim.Release is thread-safe on its own; no extra lock is needed.
        SemaphoreSlim_CutByCode.Release();
    }
}
/// <summary>
/// Asks for an iteration count, then repeats this cycle that many times: split the
/// current input text with the trie, show the first words in TXBout, decay the trie,
/// re-insert the identified words, and save the trie to a timestamped ".sav" file.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
/// <remarks>
/// This is an event handler, hence <c>async void</c>. Exceptions are caught and their
/// text is shown in TXBout; a failure inside one iteration does not stop the next ones.
/// </remarks>
private async void BTNiteration_Click(object sender, EventArgs e)
{
    try
    {
        Trace.Indent();
        int iterCount = int.Parse(Microsoft.VisualBasic.Interaction.InputBox("Iteration count?", "", "1"));
        for (int iterIdx = 0; iterIdx < iterCount; iterIdx++)
        {
            // NOTE(review): net indent change is zero; presumably this re-applies the
            // current indent level to the listeners - confirm before removing.
            Trace.Unindent();
            Trace.Indent();

            var iterationStatus = $"Iteration: {iterIdx + 1}/{iterCount}";
            TXBout.AppendText(iterationStatus + "\r\n");
            Trace.WriteLine(iterationStatus);
            try
            {
                Trace.Indent();
                var maxWordLength = int.Parse(IFdata.GetField("maxWordLength"));
                // Parsed but not used below; kept so a missing or malformed field still
                // fails exactly as before.
                var probRatio = double.Parse(IFdata.GetField("probRatio"));
                var bemsRatio = double.Parse(IFdata.GetField("bemsRatio"));

                var words = new List<string>();
                var ss = new SentenceSplitter(trie, baseDataLength);

                Trace.WriteLine("Getting FPL...");
                List<FPLtype> fpl = await Task.Run(() => SentenceSplitter.MethodsForTrie.FrequencyPerLength(trie));

                Trace.WriteLine("Preprocessing data...");
                var data = string.IsNullOrWhiteSpace(TXBdata.Text)
                    ? (txbDataFileContent ?? this.data)
                    : TXBdata.Text;
                string[] tokens = await Task.Run(() => data.Split(' '));

                // --- Split all tokens in parallel, ten tokens per work item. ---
                int splitDone = 0, splitTotal = tokens.Length, splitPermille = -1;
                object syncRoot = new object();
                Trace.WriteLine("Splitting...");
                await Task.Run(() => Parallel.For(0, (tokens.Length + 9) / 10, chunk =>
                {
                    var chunkWords = new List<string>();
                    for (int i = chunk * 10; i < (chunk + 1) * 10 && i < tokens.Length; i++)
                    {
                        var p = System.Threading.Interlocked.Increment(ref splitDone) * 1000L / splitTotal;
                        // Cheap unsynchronised pre-check, then re-check under the lock:
                        // several workers share splitPermille, and without the lock the
                        // same or a lower value could be reported more than once.
                        if (p > splitPermille)
                        {
                            lock (syncRoot)
                            {
                                if (p > splitPermille)
                                {
                                    splitPermille = (int)p;
                                    Trace.WriteLine($"Splitting... {0.1 * splitPermille}%");
                                }
                            }
                        }

                        chunkWords.AddRange(ss.Split(tokens[i], maxWordLength, fpl, probType, false));
                    }

                    // List<T> is not safe for concurrent writers; merge under the lock.
                    lock (syncRoot)
                    {
                        words.AddRange(chunkWords);
                    }
                }));
                Trace.Assert(splitDone == splitTotal);

                Trace.WriteLine($"{words.Count} words / {data.Length} chars identified.");

                // Preview: status line followed by at most the first 1000 words, each
                // followed by a space. One AppendText call instead of one per word.
                TXBout.Text = iterationStatus + "\r\n";
                int previewCount = Math.Min(1000, words.Count);
                if (previewCount > 0)
                {
                    TXBout.AppendText(string.Join(" ", words.GetRange(0, previewCount)) + " ");
                }

                var decayRatio = double.Parse(IFdata.GetField("decayRatio"));
                await Task.Run(() =>
                {
                    Trace.WriteLine($"Decaying... ratio = {decayRatio}");
                    // Sum reported by the third Traverse callback, before and after Decay.
                    long cnt = 0;
                    trie.Traverse(c => { }, () => { }, c => cnt += c);
                    Trace.Write($"\t{cnt}→");
                    trie.Decay(decayRatio);
                    cnt = 0;
                    trie.Traverse(c => { }, () => { }, c => cnt += c);
                    Trace.Write($"{cnt} OK");

                    try
                    {
                        Trace.Indent();
                        int inserted = 0, insertTotal = words.Count, percent = -1;
                        foreach (var word in words)
                        {
                            // Report the actual percentage. Incrementing by one per
                            // report (as before) lags behind whenever there are fewer
                            // than 100 words and never reaches 100%.
                            var current = (int)(++inserted * 100L / insertTotal);
                            if (current > percent)
                            {
                                percent = current;
                                Trace.WriteLine($"{words.Count} words / {data.Length} chars inserted. {percent}%");
                            }
                            trie.Insert(word);
                        }
                    }
                    finally
                    {
                        Trace.Unindent();
                    }
                });

                Trace.WriteLine("Saving Trie...");
                var fileName = $"Trie {DateTime.Now:yyyy-MM-dd HH-mm-ss.fffffff}.sav";
                using (var stream = new FileStream(fileName, FileMode.Create))
                {
                    await Task.Run(() => trie.Save(stream));
                }

                Trace.Unindent();
                Trace.Indent();
                Trace.WriteLine("OK: " + fileName);
            }
            catch (Exception error)
            {
                TXBout.Text = error.ToString();
            }
            finally
            {
                Trace.Unindent();
            }
        }

        TXBout.AppendText("\r\nOK");
    }
    catch (Exception error)
    {
        TXBout.Text = error.ToString();
    }
    finally
    {
        Trace.Unindent();
    }
}