Exemple #1
0
 /// <summary>
 /// Splits the current input text into words, writing every identified word to
 /// "output.txt" (UTF-8, overwritten) and echoing the beginning of the output into
 /// <c>TXBout</c>.
 /// </summary>
 /// <remarks>
 /// The text to split is <c>TXBdata.Text</c> when that is not blank; otherwise
 /// <c>txbDataFileContent</c> when it is not null; otherwise <c>data</c>.
 /// Exceptions are not propagated: their text is shown in <c>TXBout</c>.
 /// On exit <c>CHBsplit</c> is always set to the Indeterminate check state and enabled.
 /// </remarks>
 private async Task PerformSplit()
 {
     // Once TXBout holds this many characters no further words are echoed to it.
     const int previewLimit = 10000;

     try
     {
         Trace.Indent();

         var maxWordLength = int.Parse(IFdata.GetField("maxWordLength"));
         // Not used below; still parsed so a malformed field value is reported.
         var probRatio     = double.Parse(IFdata.GetField("probRatio"));
         var bemsRatio     = double.Parse(IFdata.GetField("bemsRatio"));

         CHBsplit.Enabled = false;

         using (var writer = new StreamWriter("output.txt", false, Encoding.UTF8))
         {
             TXBout.Clear();

             // Every word goes to the file; only the first part is mirrored into the text box.
             var onWord = new SentenceSplitter.WordIdentifiedEventHandler(word =>
             {
                 writer.WriteLine(word);
                 TXBout.Invoke(new Action(() =>
                 {
                     if (TXBout.TextLength >= previewLimit)
                     {
                         return;
                     }

                     TXBout.AppendText($"{word}\r\n");

                     // Mark the point where the preview stops following the file.
                     if (TXBout.TextLength >= previewLimit)
                     {
                         TXBout.AppendText("......(Cut)\r\n");
                     }
                 }));
             });

             var splitter = new SentenceSplitter(trie, baseDataLength);
             try
             {
                 splitter.WordIdentified += onWord;
                 Trace.WriteLine("Splitting...");

                 var source = TXBdata.Text;
                 if (string.IsNullOrWhiteSpace(source))
                 {
                     source = txbDataFileContent ?? data;
                 }

                 long total = 0;
                 foreach (var piece in source.Split(' ', '\r', '\n', '\t'))
                 {
                     var found = await splitter.SplitAsync(piece, maxWordLength, probType, true);
                     total += found.Count;
                 }

                 Trace.WriteLine($"{total} words identified.");
             }
             catch (Exception ex)
             {
                 TXBout.Text = ex.ToString();
             }
             finally
             {
                 splitter.WordIdentified -= onWord;
             }

             writer.Close();
         }
     }
     catch (Exception ex)
     {
         TXBout.Text = ex.ToString();
     }
     finally
     {
         Trace.Unindent();
         CHBsplit.CheckState = CheckState.Indeterminate;
         CHBsplit.Enabled    = true;
     }
 }
 /// <summary>
 /// Builds the "SA" tab: places the controls in the layout panels, fills the item lists of
 /// <c>CBmethod</c> and <c>CBprobType</c>, wires the event handlers, creates the
 /// <c>SuffixArray</c> and <c>SentenceSplitter</c> instances, and calls <c>StartServices</c>.
 /// </summary>
 public SATabPage() : base("SA")
 {
     // --- Top row: input box on the left, scrollable control column on the right. ---
     TLPmain.Controls.Add(TLPtop, 0, 0);
     TLPtop.Controls.Add(TXBin, 0, 0);

     var ctrlHost = new MyPanel();
     ctrlHost.Controls.Add(TLPctrl);
     ctrlHost.AutoScroll = true;
     ctrlHost.Dock       = DockStyle.Top;
     TLPtop.Controls.Add(ctrlHost, 1, 0);

     // --- Control column: one control per row, top to bottom. ---
     int nextRow = 0;

     TLPctrl.Controls.Add(CBmethod, 0, nextRow++);
     foreach (var methodLabel in new[] { "Count Word", "List Words", "Send Socket", "Cut by Code" })
     {
         CBmethod.Items.Add(methodLabel);
     }

     TLPctrl.Controls.Add(BTNexportSA, 0, nextRow++);
     TLPctrl.Controls.Add(BTNsave, 0, nextRow++);
     TLPctrl.Controls.Add(BTNload, 0, nextRow++);
     TLPctrl.Controls.Add(BTNexportList, 0, nextRow++);
     TLPctrl.Controls.Add(BTNnew, 0, nextRow++);

     TLPctrl.Controls.Add(CHBdebugMode, 0, nextRow++);
     TLPctrl.Controls.Add(CHBreplaceWithEmptyExceptChinese, 0, nextRow++);
     TLPctrl.Controls.Add(CHBremoveEmpty, 0, nextRow++);
     TLPctrl.Controls.Add(CHBverbose, 0, nextRow++);

     TLPctrl.Controls.Add(IFdata, 0, nextRow++);
     IFdata.AddField("maxWordLength", maxWordLength.ToString());

     TLPctrl.Controls.Add(CBprobType, 0, nextRow++);
     foreach (var probTypeLine in SentenceSplitter.probTypeString.Split('\n'))
     {
         CBprobType.Items.Add(probTypeLine);
     }
     CBprobType.SelectedValueChanged += delegate
     {
         // Pick the first enum value whose "probType == ProbTypeEnum.X" marker occurs in the
         // selected text; fall back to the enum's default value when none does.
         var matched = default(SentenceSplitter.ProbTypeEnum);
         foreach (SentenceSplitter.ProbTypeEnum candidate in Enum.GetValues(typeof(SentenceSplitter.ProbTypeEnum)))
         {
             if (CBprobType.Text.IndexOf($"probType == ProbTypeEnum.{candidate}") != -1)
             {
                 matched = candidate;
                 break;
             }
         }
         probType = matched;
     };

     TLPctrl.Controls.Add(CHBsplit, 0, nextRow++);

     // --- Remaining rows of the main layout. ---
     TLPmain.Controls.Add(TXBout, 0, 1);
     TLPmain.Controls.Add(TXBdata, 0, 2);

     // --- Event wiring. ---
     TXBdata.MouseDoubleClick += TXBdata_MouseDoubleClick;
     TXBin.TextChanged        += TXBin_TextChanged;
     TXBin.KeyDown            += TXBin_KeyDown;
     TXBin.ContextMenu         = new ContextMenu(new[]
     {
         new MenuItem("sample code", (menuSender, menuArgs) => { TXBin.Text = sampleCode; })
     });
     BTNexportSA.Click        += BTNexportSA_Click;
     BTNexportList.Click      += BTNexportList_Click;
     CHBsplit.CheckedChanged  += CHBsplit_CheckedChanged;
     BTNsave.Click            += BTNsave_Click;
     BTNload.Click            += BTNload_Click;
     BTNnew.Click             += BTNnew_Click;

     this.Controls.Add(TLPmain);

     // --- Backing data structures and background services. ---
     sa = new SuffixArray();
     ss = new SentenceSplitter(sa);
     StartServices();
 }
        /// <summary>
        /// Compiles <paramref name="dataInput"/> into a scoring method and uses it to split the
        /// current input text into words.
        /// </summary>
        /// <remarks>
        /// <para>
        /// <paramref name="dataInput"/> is pasted verbatim as the body of
        /// <c>public static double FooMethod(string S, int N, Func&lt;string,int&gt; C)</c> and
        /// compiled through <c>Utils.DynamicCompile.GetMethod</c>. It therefore runs as arbitrary
        /// code in this process; only pass text from a trusted source.
        /// </para>
        /// <para>
        /// Calls are serialized by <c>SemaphoreSlim_CutByCode</c>. Every call takes a ticket from
        /// <c>counter_CutByCode</c>; a call that finds a newer ticket has been issued by the time
        /// it acquires the semaphore gives up and returns <c>null</c>.
        /// </para>
        /// <para>
        /// The text to split is <c>TXBdata.Text</c> when that is not blank; otherwise
        /// <c>txbDataFileContent</c> when it is not null; otherwise <c>data</c>.
        /// </para>
        /// </remarks>
        /// <param name="dataInput">C# statements used as the body of the generated method.</param>
        /// <returns>
        /// <c>null</c> when the call was superseded; the exception text when compilation or
        /// splitting throws; otherwise one line per input piece with its words joined by spaces.
        /// Lines stop being added once the next one would push the output past 10000 characters.
        /// </returns>
        async Task<string> CutByCode(string dataInput)
        {
            var counter = System.Threading.Interlocked.Increment(ref counter_CutByCode);

            // Acquire BEFORE entering the try block: if the wait itself fails, the finally
            // below must not release a semaphore this call never held.
            await SemaphoreSlim_CutByCode.WaitAsync();
            try
            {
                // A newer request arrived while this one was queued; let that one do the work.
                if (counter != System.Threading.Interlocked.Read(ref counter_CutByCode))
                {
                    return null;
                }

                const string namespaceName = "WikiDataAnalysis", className = "FooClass", methodName = "FooMethod";
                string       code =
                    "using System;" +
                    $"namespace {namespaceName}" +
                    "{" +
                    $"   class {className}" +
                    "   {" +
                    $"       public static double {methodName}(string S,int N,Func<string,int> C)" +
                    "       {" +
                    $"           {dataInput}" +
                    "       }" +
                    "   }" +
                    "}";

                try
                {
                    Trace.Indent();
                    Trace.WriteLine($"Compiling... code length = {code.Length}");
                    System.Reflection.MethodInfo methodInfo = Utils.DynamicCompile.GetMethod(code, namespaceName, className, methodName, "System");
                    var method = new Func<string, int, Func<string, int>, double>((s, n, c) => (double)methodInfo.Invoke(null, new object[] { s, n, c }));
                    Trace.WriteLine("Splitting...");

                    var maxWordLength = int.Parse(IFdata.GetField("maxWordLength"));
                    // NOTE(review): parsed but never used below. Kept so a missing or malformed
                    // field still surfaces as an error, as before - confirm whether still wanted.
                    var probRatio     = double.Parse(IFdata.GetField("probRatio"));
                    var bemsRatio     = double.Parse(IFdata.GetField("bemsRatio"));

                    // Read the text box here, next to the other control reads, rather than from
                    // inside Task.Run: controls should not be touched from a thread-pool thread.
                    var mainInputs = string.IsNullOrWhiteSpace(TXBdata.Text) ? (txbDataFileContent != null ? txbDataFileContent : data) : TXBdata.Text;

                    StringBuilder sb_ret = new StringBuilder();
                    long          cnt    = 0;
                    await Task.Run(() =>
                    {
                        var inputs = mainInputs.Split(' ', '\r', '\n', '\t');
                        if (ss_CutByCode == null)
                        {
                            ss_CutByCode = new SentenceSplitter(sa);
                        }

                        const int maxoutputLength = 10000;
                        bool      appending       = true;
                        int       progress        = 0, total_progress = inputs.Length;
                        var       lastUpdateTime  = DateTime.MinValue;
                        foreach (var input in inputs)
                        {
                            ++progress;

                            // Throttle progress tracing to at most one line per half second.
                            if ((DateTime.Now - lastUpdateTime).TotalSeconds > 0.5)
                            {
                                Trace.WriteLine($"Splitting... {progress}/{total_progress}");
                                lastUpdateTime = DateTime.Now;
                            }

                            var cutResult = ss_CutByCode.Split(input, maxWordLength, method, false);
                            cnt += cutResult.Count;

                            // Once the output would exceed the cap, stop appending for good but
                            // keep splitting so the final word count covers the whole input.
                            if (sb_ret.Length + cutResult.Sum(s => (long)s.Length) > maxoutputLength)
                            {
                                appending = false;
                            }
                            if (appending)
                            {
                                sb_ret.AppendLine(string.Join(" ", cutResult));
                            }
                        }
                    });

                    Trace.WriteLine($"{cnt} words identified.");
                    return sb_ret.ToString();
                }
                catch (Exception error)
                {
                    // Compilation and runtime errors are handed back as the result text.
                    return error.ToString();
                }
                finally
                {
                    Trace.Unindent();
                }
            }
            finally
            {
                // SemaphoreSlim.Release is thread-safe on its own; no extra lock is required.
                SemaphoreSlim_CutByCode.Release();
            }
        }
Exemple #4
0
        /// <summary>
        /// Click handler that asks for an iteration count and then repeatedly splits the input
        /// with the current trie, decays the trie, re-inserts the words found, and saves the trie.
        /// </summary>
        /// <remarks>
        /// <para>
        /// Each iteration: obtains the FPL list from
        /// <c>SentenceSplitter.MethodsForTrie.FrequencyPerLength(trie)</c>; splits the input on
        /// spaces; runs <c>ss.Split</c> over the pieces in parallel; shows up to the first 1000
        /// words in <c>TXBout</c>; calls <c>trie.Decay</c> with the "decayRatio" field; inserts
        /// every word into the trie; and writes the trie to a timestamped "Trie ....sav" file.
        /// </para>
        /// <para>
        /// The input is <c>TXBdata.Text</c> when that is not blank; otherwise
        /// <c>txbDataFileContent</c> when it is not null; otherwise the <c>data</c> field.
        /// An exception inside an iteration is shown in <c>TXBout</c> and the loop moves on to
        /// the next iteration; an exception outside the loop body (for example a non-numeric
        /// iteration count) is shown in <c>TXBout</c> and ends the handler.
        /// </para>
        /// </remarks>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private async void BTNiteration_Click(object sender, EventArgs e)
        {
            try
            {
                Trace.Indent();
                int iterCount = int.Parse(Microsoft.VisualBasic.Interaction.InputBox("Iteration count?", "", "1"));
                for (int iterIdx = 0; iterIdx < iterCount; iterIdx++)
                {
                    // Net indent is unchanged by this pair.
                    // NOTE(review): presumably here to nudge a trace listener - confirm.
                    Trace.Unindent();
                    Trace.Indent();
                    var iterationStatus = $"Iteration: {iterIdx + 1}/{iterCount}";
                    TXBout.AppendText(iterationStatus + "\r\n");
                    Trace.WriteLine(iterationStatus);
                    try
                    {
                        Trace.Indent();
                        var            maxWordLength = int.Parse(IFdata.GetField("maxWordLength"));
                        // NOTE(review): parsed but never used below. Kept so a missing or
                        // malformed field still surfaces as an error - confirm whether wanted.
                        var            probRatio     = double.Parse(IFdata.GetField("probRatio"));
                        var            bemsRatio     = double.Parse(IFdata.GetField("bemsRatio"));
                        var            words         = new List<string>();
                        var            ss            = new SentenceSplitter(trie, baseDataLength);
                        List<FPLtype>  fpl           = null;
                        Trace.WriteLine("Getting FPL...");
                        await Task.Run(() => fpl = SentenceSplitter.MethodsForTrie.FrequencyPerLength(trie));

                        string[] ddd = null;
                        Trace.WriteLine("Preprocessing data...");
                        // Read the text box on this thread, before any work is offloaded.
                        var data = string.IsNullOrWhiteSpace(TXBdata.Text) ? (txbDataFileContent != null ? txbDataFileContent : this.data) : TXBdata.Text;
                        await Task.Run(() => ddd = data.Split(' '));

                        {
                            // Pieces are handed to the parallel loop in chunks of this size.
                            const int chunkSize = 10;

                            // 'percent' is in tenths of a percent (0..1000).
                            int    progress     = 0, total_progress = ddd.Length, percent = -1;
                            object syncRoot     = new object();
                            object progressLock = new object();
                            Trace.WriteLine("Splitting...");
                            await Task.Run(() => Parallel.For(0, (ddd.Length + chunkSize - 1) / chunkSize, chunk =>
                            {
                                List<string> ans = new List<string>();
                                for (int i = chunk * chunkSize; i < (chunk + 1) * chunkSize && i < ddd.Length; i++)
                                {
                                    var p = System.Threading.Interlocked.Increment(ref progress) * 1000L / total_progress;

                                    // The compare-and-update of 'percent' is shared by all
                                    // workers; do it under a lock so the reported value never
                                    // goes backwards and no step is reported twice.
                                    lock (progressLock)
                                    {
                                        if (p > percent)
                                        {
                                            percent = (int)p;
                                            Trace.WriteLine($"Splitting... {0.1 * percent}%");
                                        }
                                    }

                                    ans.AddRange(ss.Split(
                                                     ddd[i],
                                                     maxWordLength,
                                                     fpl,
                                                     probType,
                                                     false));
                                }
                                // Merge this chunk's words; List<T> is not safe for concurrent writes.
                                lock (syncRoot) words.AddRange(ans);
                            }));

                            Trace.Assert(progress == total_progress);
                        }
                        Trace.WriteLine($"{words.Count} words / {data.Length} chars identified.");
                        TXBout.Text = iterationStatus + "\r\n";
                        for (int i = 0; i < 1000 && i < words.Count; i++)
                        {
                            TXBout.AppendText(words[i] + " ");
                        }
                        var decayRatio = double.Parse(IFdata.GetField("decayRatio"));
                        await Task.Run(() =>
                        {
                            Trace.WriteLine($"Decaying... ratio = {decayRatio}");
                            // Trace the traversal total before and after the decay.
                            long cnt = 0;
                            trie.Traverse(c => { }, () => { }, c => cnt += c);
                            Trace.Write($"\t{cnt}→");
                            trie.Decay(decayRatio);
                            cnt = 0;
                            trie.Traverse(c => { }, () => { }, c => cnt += c);
                            Trace.Write($"{cnt} OK");
                            try
                            {
                                Trace.Indent();
                                int progress = 0, total_progress = words.Count, percent = -1;
                                foreach (var word in words)
                                {
                                    // Report the real percentage. Bumping the counter by one per
                                    // message under-reports when there are fewer than 100 words.
                                    var current = (int)(++progress * 100L / total_progress);
                                    if (current > percent)
                                    {
                                        percent = current;
                                        Trace.WriteLine($"{words.Count} words / {data.Length} chars inserted. {percent}%");
                                    }
                                    trie.Insert(word);
                                }
                            }
                            finally { Trace.Unindent(); }
                        });

                        Trace.WriteLine("Saving Trie...");
                        var fileName = $"Trie {DateTime.Now.ToString("yyyy-MM-dd HH-mm-ss.fffffff")}.sav";
                        using (var stream = new FileStream(fileName, FileMode.Create))
                        {
                            await Task.Run(() => trie.Save(stream));
                        }
                        Trace.Unindent();
                        Trace.Indent();
                        Trace.WriteLine("OK: " + fileName);
                    }
                    catch (Exception error) { TXBout.Text = error.ToString(); }
                    finally { Trace.Unindent(); }
                }
                TXBout.AppendText("\r\nOK");
            }
            catch (Exception error) { TXBout.Text = error.ToString(); }
            finally { Trace.Unindent(); }
        }