/// <summary>
/// Counts how many consecutive slots of <c>sa.SA</c>, around the slot
/// <c>sa.RANK[startIndex]</c>, hold positions whose text in <c>sa.S</c> compares equal
/// (ordinal, at most <paramref name="length"/> characters) to the text at the starting slot.
/// </summary>
/// <remarks>
/// Each side is explored by doubling the step while the probe still matches and then
/// halving it back down to 1. This presumably relies on matching slots being contiguous
/// (i.e. <c>sa.SA</c> being sorted) - confirm against SuffixArray.
/// Presumably equivalent to <c>sa.UpperBound(s) - sa.LowerBound(s)</c> for
/// <c>s = sa.S.Substring(startIndex, length)</c>, without allocating the substring.
/// </remarks>
/// <param name="sa">Suffix array providing <c>S</c>, <c>SA</c> and <c>RANK</c>.</param>
/// <param name="startIndex">Index into <c>sa.RANK</c> selecting the starting slot.</param>
/// <param name="length">Maximum number of characters compared per probe.</param>
/// <returns>Size of the matching slot range, including the starting slot (always at least 1).</returns>
private static int CountInsideSA(SuffixArray sa, int startIndex, int length)
            {
                int total = sa.S.Length;
                int lo = sa.RANK[startIndex];
                int hi = lo;

                // ---- extend towards the front of the array ----
                int step = 1;
                while (lo - step >= 0
                       && string.Compare(sa.S, sa.SA[lo], sa.S, sa.SA[lo - step], length, StringComparison.Ordinal) == 0)
                {
                    lo -= step;
                    step <<= 1;
                }
                // The last doubling overshot (or ran off the array); refine with shrinking steps.
                while (step > 0)
                {
                    if (lo - step >= 0
                        && string.Compare(sa.S, sa.SA[lo], sa.S, sa.SA[lo - step], length, StringComparison.Ordinal) == 0)
                    {
                        lo -= step;
                    }
                    step >>= 1;
                }

                // ---- extend towards the back of the array ----
                step = 1;
                while (hi + step < total
                       && string.Compare(sa.S, sa.SA[hi], sa.S, sa.SA[hi + step], length, StringComparison.Ordinal) == 0)
                {
                    hi += step;
                    step <<= 1;
                }
                while (step > 0)
                {
                    if (hi + step < total
                        && string.Compare(sa.S, sa.SA[hi], sa.S, sa.SA[hi + step], length, StringComparison.Ordinal) == 0)
                    {
                        hi += step;
                    }
                    step >>= 1;
                }

                return hi - lo + 1;
            }
 /// <summary>
 /// Creates a splitter whose data-access delegates are backed by the given suffix array.
 /// </summary>
 /// <param name="sa">
 /// Suffix array captured by the delegates; it is only dereferenced when a delegate is invoked,
 /// not during construction.
 /// </param>
 public SentenceSplitter(SuffixArray sa)
 {
     // Invoking NextChars on this backend always throws NotImplementedException.
     NextChars = delegate { throw new NotImplementedException(); };

     // Length of the text held by the suffix array, read at call time.
     BaseDataLength = () => { return sa.S.Length; };

     // The remaining delegates forward to the static suffix-array helpers.
     Count = s => MethodsForSuffixArray.Count(sa, s);
     FrequencyPerLength = () => MethodsForSuffixArray.FrequencyPerLength(sa);
 }
Пример #3
0
 /// <summary>
 /// Builds the "SA" tab page: lays out the input/output text boxes and the control column,
 /// fills the combo boxes, wires the event handlers, creates the <c>SuffixArray</c> and its
 /// <c>SentenceSplitter</c>, and finally calls <c>StartServices()</c>.
 /// </summary>
 /// <remarks>
 /// The bare <c>{ ... }</c> blocks only group statements visually under the control they
 /// configure. The order of the <c>row++</c> calls fixes the top-to-bottom order of the
 /// control column, so those statements must not be reordered.
 /// </remarks>
 public SATabPage() : base("SA")
 {
     //InitializeComponent();
     TLPmain.Controls.Add(TLPtop, 0, 0);
     {
         TLPtop.Controls.Add(TXBin, 0, 0);
         // The control column lives in its own scrollable panel next to the input box.
         TLPtop.Controls.Add(new MyPanel()
         {
             Controls = { TLPctrl }, AutoScroll = true, Dock = DockStyle.Top
         }, 1, 0);
         {
             //TLPctrl.SetRowSpan(TXBin, TLPctrl.RowCount);
             int row = 0;
             TLPctrl.Controls.Add(CBmethod, 0, row++);
             {
                 CBmethod.Items.Add("Count Word");
                 CBmethod.Items.Add("List Words");
                 CBmethod.Items.Add("Send Socket");
                 CBmethod.Items.Add("Cut by Code");
             }
             TLPctrl.Controls.Add(BTNexportSA, 0, row++);
             TLPctrl.Controls.Add(BTNsave, 0, row++);
             TLPctrl.Controls.Add(BTNload, 0, row++);
             TLPctrl.Controls.Add(BTNexportList, 0, row++);
             TLPctrl.Controls.Add(BTNnew, 0, row++);
             TLPctrl.Controls.Add(CHBdebugMode, 0, row++);
             TLPctrl.Controls.Add(CHBreplaceWithEmptyExceptChinese, 0, row++);
             TLPctrl.Controls.Add(CHBremoveEmpty, 0, row++);
             TLPctrl.Controls.Add(CHBverbose, 0, row++);
             TLPctrl.Controls.Add(IFdata, 0, row++);
             {
                 IFdata.AddField("maxWordLength", maxWordLength.ToString());
             }
             TLPctrl.Controls.Add(CBprobType, 0, row++);
             {
                 // One combo-box entry per line of SentenceSplitter.probTypeString.
                 foreach (var s in SentenceSplitter.probTypeString.Split('\n'))
                 {
                     CBprobType.Items.Add(s);
                 }
                 CBprobType.SelectedValueChanged += (sender, e) =>
                 {
                     // Select the first enum value whose "probType == ProbTypeEnum.<name>" marker
                     // occurs in the chosen entry. The marker is a code identifier, not natural
                     // language, so it is searched ordinally instead of with the current culture.
                     // If nothing matches, FirstOrDefault yields the enum's default value.
                     probType = Enum.GetValues(typeof(SentenceSplitter.ProbTypeEnum))
                                    .Cast<SentenceSplitter.ProbTypeEnum>()
                                    .FirstOrDefault(v => CBprobType.Text.IndexOf($"probType == ProbTypeEnum.{v}", StringComparison.Ordinal) != -1);
                     //MessageBox.Show(probType.ToString());
                 };
             }
             TLPctrl.Controls.Add(CHBsplit, 0, row++);
         }
     }
     TLPmain.Controls.Add(TXBout, 0, 1);
     TLPmain.Controls.Add(TXBdata, 0, 2);

     // Event wiring.
     //TXBdata.TextChanged += TXBdata_TextChanged;
     TXBdata.MouseDoubleClick += TXBdata_MouseDoubleClick;
     TXBin.TextChanged        += TXBin_TextChanged;
     TXBin.KeyDown            += TXBin_KeyDown;
     // Right-click menu that replaces the input text with the built-in sample code.
     TXBin.ContextMenu         = new ContextMenu(new[] { new MenuItem("sample code", delegate { TXBin.Text = sampleCode; }) });
     BTNexportSA.Click        += BTNexportSA_Click;
     BTNexportList.Click      += BTNexportList_Click;
     CHBsplit.CheckedChanged  += CHBsplit_CheckedChanged;
     BTNsave.Click            += BTNsave_Click;
     BTNload.Click            += BTNload_Click;
     BTNnew.Click             += BTNnew_Click;
     this.Controls.Add(TLPmain);
     //sam = new SAM();
     //sam.StatusChanged += (s) => { this.Invoke(new Action(() => this.Text = $"[*] {s}")); };
     //sm = new SimpleMethod();
     //sm.StatusChanged += (s) => { this.Invoke(new Action(() => this.Text = $"[*] {s}")); };
     //sa.StatusChanged += (s) => { this.Invoke(new Action(() => this.Text = $"[*] {s}")); };

     // Back-end objects: the splitter is bound to this page's suffix array.
     sa = new SuffixArray();
     ss = new SentenceSplitter(sa);
     StartServices();
 }
            /// <summary>
            /// Computes, for every length <c>gram</c> in <c>1..n</c> (<c>n = sa.S.Length</c>),
            /// statistics over the groups of adjacent suffix-array slots that are not separated
            /// by a <c>sa.HEIGHT</c> value below <c>gram</c>. The result is cached in <c>sa.FPL</c>.
            /// </summary>
            /// <remarks>
            /// <para>
            /// Phase 1 sorts the boundaries <c>1..n-1</c> by <c>sa.HEIGHT</c> and, walking
            /// <c>gram</c> from <c>n</c> down to 1, removes every boundary whose height is at least
            /// <c>gram</c> from a doubly linked list, recording the sizes of the two groups that
            /// merge. Phase 2 replays those merges to maintain running aggregates.
            /// </para>
            /// <para>
            /// NOTE(review): <c>HEIGHT</c> is presumably the LCP array of the suffix array, which
            /// would make each group the occurrences of one distinct length-<c>gram</c> prefix -
            /// confirm against SuffixArray. The field named <c>stderr</c> receives
            /// sqrt(E[x^2] - E[x]^2), i.e. a population standard deviation, and
            /// <c>percent10</c> is computed with a 0.01 threshold - confirm both are intended.
            /// </para>
            /// </remarks>
            /// <param name="sa">Suffix array providing <c>S</c>, <c>HEIGHT</c> and the <c>FPL</c> cache.</param>
            /// <returns>
            /// A list of <c>n + 1</c> entries; index 0 is left at its default value and index
            /// <c>gram</c> holds the statistics for that length. For an empty text a single default
            /// entry is returned and nothing is cached.
            /// </returns>
            /// <exception cref="Exception">
            /// Any failure is shown in a message box and then rethrown unchanged.
            /// </exception>
            public static List<FPLtype> FrequencyPerLength(SuffixArray sa)
            {
                Trace.WriteLine("FrequencyPerLength(SuffixArray sa)...");
                try
                {
                    Trace.Indent();
                    if (sa.FPL != null)
                    {
                        return(sa.FPL);
                    }
                    int n = sa.S.Length;
                    if (n == 0)
                    {
                        // Empty text: the code below would write cnt[1] into an array of length 1.
                        // Return the same shape (n + 1 entries, index 0 unused) without caching,
                        // so a later call on a populated array still computes real data.
                        return(new List<FPLtype> { default(FPLtype) });
                    }
                    Trace.WriteLine("Copying height data...");
                    // (height, boundary index) for every boundary between adjacent slots.
                    List<Tuple<int, int>> h = new List<Tuple<int, int>>();
                    for (int i = 1; i < n; i++)
                    {
                        h.Add(new Tuple<int, int>(sa.HEIGHT[i], i));
                    }
                    Trace.WriteLine("Sorting...");
                    //DistributedSort(h, (a, b) => a.Item1.CompareTo(b.Item1));
                    h.Sort((a, b) => a.Item1.CompareTo(b.Item1));
                    // Merge log: (leftSize, rightSize) per merge, with a (-1, j) marker closing each gram.
                    List<Tuple<int, int>> changes = new List<Tuple<int, int>>();
                    {
                        Trace.WriteLine("Creating linked list...");
                        // Doubly linked list over the boundaries 0..n; initially every slot is its own group.
                        int[] linkl = new int[n + 1], linkr = new int[n + 1];
                        for (int i = 0; i < n; i++)
                        {
                            linkl[i + 1] = i;
                            linkr[i]     = i + 1;
                        }
                        Trace.WriteLine("Simulate changes...");
                        int j = n - 2; // index of the largest height not yet consumed
                        for (int gram = n; gram >= 1; gram--)
                        {
                            while (j >= 0 && h[j].Item1 >= gram)
                            {
                                // Boundary k no longer separates at this length: unlink it and
                                // record the sizes of the two groups it used to divide.
                                int k = h[j].Item2;
                                int l = linkl[k], r = linkr[k];
                                changes.Add(new Tuple<int, int>(k - l, r - k));
                                linkl[r] = l;
                                linkr[l] = r;
                                --j;//j+1 is the num of splittings
                            }
                            changes.Add(new Tuple<int, int>(-1, j));
                        }
                    }
                    Trace.WriteLine("Building ans...");
                    List<FPLtype> ans = new List<FPLtype>();
                    ans.Resize(n + 1, default(FPLtype));
                    //for (int i = 1; i <= n; i++) ans[i] = new FPLtype();
                    Trace.WriteLine("Filling ans...");
                    {
                        // Running aggregates over the current group sizes; they start with n groups of size 1.
                        long   sp2   = n; //sum of power 2
                        double sp0_5 = n; //sum of power 0.5
                        double slog  = n * Math.Log(2); // sum of log(size + 1)

                        // cnt[s] = number of groups of size s (a new array is already zero-filled).
                        int[] cnt = new int[n + 1];
                        // current_count = number of groups whose size is <= cursor.
                        int cursor = 1, current_count = cnt[1] = n;
                        //System.Windows.Forms.MessageBox.Show("pass");

                        for (int gram = n, c = 0; gram >= 1; gram--)
                        {
                            // Replay this gram's merges; the loop stops just past the (-1, j) marker.
                            while (true)
                            {
                                var p = changes[c++];
                                if (p.Item1 == -1)
                                {
                                    break;
                                }
                                sp2   -= Sq(p.Item1);
                                sp2   -= Sq(p.Item2);
                                sp2   += Sq(p.Item1 + p.Item2);
                                sp0_5 -= Math.Sqrt(p.Item1);
                                sp0_5 -= Math.Sqrt(p.Item2);
                                sp0_5 += Math.Sqrt(p.Item1 + p.Item2);
                                slog  -= Math.Log(p.Item1 + 1);
                                slog  -= Math.Log(p.Item2 + 1);
                                slog  += Math.Log(p.Item1 + p.Item2 + 1);
                                --cnt[p.Item1]; if (p.Item1 <= cursor)
                                {
                                    --current_count;
                                }
                                --cnt[p.Item2]; if (p.Item2 <= cursor)
                                {
                                    --current_count;
                                }
                                ++cnt[p.Item1 + p.Item2]; if (p.Item1 + p.Item2 <= cursor)
                                {
                                    ++current_count;
                                }
                                //Trace.WriteLine($"{p.Item1}\t{p.Item2}");
                                //Trace.Assert(cnt[p.Item1] >= 0 && cnt[p.Item2] >= 0);
                            }
                            // Marker payload: j + 1 boundaries remain, hence j + 2 groups.
                            int j = changes[c - 1].Item2;
                            try
                            {
                                // Move cursor to the smallest size at which at least 1% of the
                                // groups have size <= cursor.
                                while (current_count < 0.01 * (j + 2))
                                {
                                    current_count += cnt[++cursor];
                                }
                                while (current_count - cnt[cursor] >= 0.01 * (j + 2))
                                {
                                    current_count -= cnt[cursor--];
                                }
                            }
                            catch (Exception error) { System.Windows.Forms.MessageBox.Show($"cursor={cursor}, current_count={current_count}, i={gram}, j={j}, n={n}\r\n" + error.ToString()); }
                            double u = (double)n / (j + 2);
                            ans[gram] = new FPLtype
                            {
                                uniqCnt   = j + 2,
                                mean      = u,
                                stderr    = Math.Sqrt((double)sp2 / (j + 2) - u * u),
                                sqrtSum   = sp0_5,
                                logSum    = slog,
                                percent10 = cursor
                            };
                        }
                    }
                    Trace.Write("OK");
                    //System.Windows.Forms.MessageBox.Show(string.Join(", ", ans.GetRange(0, 20)));
                    return(sa.FPL = ans);
                }
                catch (Exception error) { System.Windows.Forms.MessageBox.Show(error.ToString()); throw; }
                finally { Trace.Unindent(); }
            }
 /// <summary>
 /// Returns <c>sa.UpperBound(s) - sa.LowerBound(s)</c>; presumably the number of
 /// suffix-array entries matching <paramref name="s"/> (confirm against SuffixArray).
 /// </summary>
 /// <param name="sa">Suffix array to query.</param>
 /// <param name="s">String passed to both bound lookups.</param>
 /// <returns>The difference between the upper and the lower bound.</returns>
 public static int Count(SuffixArray sa, string s)
 {
     var upper = sa.UpperBound(s);
     var lower = sa.LowerBound(s);
     return upper - lower;
 }