コード例 #1
0
ファイル: MainForm.cs プロジェクト: boogop/NaiveStringSearch
        /// <summary>
        /// Loads a file, scans its text for dictionary words and displays the results.
        /// </summary>
        /// <remarks>
        /// The text is scanned in overlapping 20-character chunks taken every 10 characters.
        /// Within a chunk, every position whose byte is an ASCII letter or a space is treated
        /// as a possible word start; the candidate is then shortened from the right until the
        /// spell checker accepts it or fewer than three characters remain.
        ///
        /// Side effects: updates the window title, the size/created/accessed labels, the
        /// progress bar, the result text box and the word-count label. The count label shows
        /// the total number of hits, including duplicates that are hidden in the text box.
        ///
        /// NOTE(review): Application.DoEvents() is pumped once per chunk, so this method can be
        /// re-entered from another UI event while a scan is still running.
        /// </remarks>
        /// <param name="filename">
        /// Path of the file to scan. When chkNull.isNull reports it as null, an open-file
        /// dialog is shown and the scan is abandoned if the user cancels.
        /// </param>
        private void openFile(string filename)
        {
            string name = chkNull.whenNull(filename);

            if (chkNull.isNull(filename))
            {
                openFileDialog1 = new OpenFileDialog();
                openFileDialog1.RestoreDirectory = false;
                if (openFileDialog1.ShowDialog() != DialogResult.OK)
                {
                    return;
                }
                name = openFileDialog1.FileName;
            }

            if (chkNull.isNull(name))
            {
                return;
            }

            // Chunks overlap by (chunksize - iIncrement) characters on the assumption that a
            // word straddling a chunk edge has at most 10 characters in the preceding chunk.
            const int chunksize  = 20;
            const int iIncrement = chunksize - 10;

            this.Text = "Naive String Search [" + name + "]";

            FileInfo f = new FileInfo(name);

            // File size in megabytes, rounded to two decimals.
            double len = 0;

            if (f.Length > 0)
            {
                len = Math.Round((chkNull.numNull(f.Length) / 1024 / 1024), 2);
            }

            lblSize.Text     = len.ToString() + "MB";
            lblCreated.Text  = f.CreationTime.ToShortDateString();
            lblAccessed.Text = f.LastAccessTime.ToShortDateString();

            txtText.Text = "";

            Files fs = new Files();

            fs.processFile(name);

            // Decoded text and the raw bytes it came from.
            // NOTE(review): the scan assumes bytes[n] corresponds to raw[n] - confirm in Files.
            string        raw     = fs.theText;
            byte[]        bytes   = fs.theBytes;
            List <string> strings = new List <string>();

            pb1.Maximum = raw.Length / iIncrement + chunksize;

            // NOTE(review): stopping at raw.Length - chunksize means text shorter than one chunk
            // is never scanned, and the last few characters of longer text may not fall inside
            // any chunk. Left unchanged because GeneralTools.Mid's behaviour past the end of the
            // string is not visible here.
            int end = raw.Length - chunksize;

            try
            {
                for (int i = 0; i < end; i += iIncrement)
                {
                    string chunk = GeneralTools.Mid(raw, i, chunksize);

                    for (int j = 0; j < chunk.Length; j++)
                    {
                        // Guard against the byte array being shorter than the decoded text.
                        int byteIndex = i + j;
                        if (byteIndex >= bytes.Length)
                        {
                            break;
                        }

                        // Only letters and spaces are considered as the start of a candidate.
                        if (!isWordStartByte(bytes[byteIndex]))
                        {
                            continue;
                        }

                        /*
                         * Loop j moves the LEFT edge of the candidate one character at a time:
                         *
                         *   chunk : [left] _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ [right]
                         *   tail  :           [left] _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ [right]
                         *
                         * Loop k moves the RIGHT edge inwards until the spell checker accepts
                         * the remaining text, i.e. it finds the longest accepted prefix of tail:
                         *
                         *   cand. :           [left] _ _ _ _ _ [right]
                         *
                         * Breaking out of k after the first hit prevents "model" from also
                         * yielding "mode" and "mod" at the SAME start position. It does not
                         * stop sub-word hits altogether: j keeps advancing into the word just
                         * found, and a chunk whose right edge cuts a word short only ever sees
                         * the truncated prefix. That is the likely source of results such as
                         * "resource" + "sour" + "res".
                         */
                        string tail = GeneralTools.Right(chunk, chunk.Length - j);

                        for (int k = 0; k < tail.Length; k++)
                        {
                            string candidate = GeneralTools.Left(tail, tail.Length - k);

                            // The spell checker accepts far too many random 2-letter
                            // combinations, so anything shorter than 3 characters is ignored.
                            if (candidate.Length < 3)
                            {
                                break;
                            }

                            // Invariant casing keeps the dictionary lookup independent of the
                            // user's culture (e.g. Turkish dotless i).
                            if (nS.TestWord(candidate.ToLowerInvariant()))
                            {
                                strings.Add(candidate);
                                break;
                            }
                        }
                    }

                    // Never step past Maximum; doing so would throw.
                    if (pb1.Value < pb1.Maximum)
                    {
                        pb1.Value++;
                    }
                    Application.DoEvents();
                }

                // Build the visible list locally and assign it to the text box once, rather
                // than reading and rewriting the control for every word. A word is suppressed
                // when it already occurs anywhere in the text shown so far (case-insensitive
                // substring match), so it also hides words contained in longer earlier ones.
                string shown      = "";
                string shownUpper = "";

                for (int i = 0; i < strings.Count; i++)
                {
                    string wordUpper = strings[i].ToUpperInvariant();

                    if (!shownUpper.Contains(wordUpper))
                    {
                        shown      += strings[i] + " ";
                        shownUpper += wordUpper + " ";
                    }
                }

                txtText.Text  = shown;
                lblCount.Text = strings.Count.ToString();
            }
            finally
            {
                // Always reset the bar so a failed scan does not leave stale progress behind.
                pb1.Value = 0;
            }
        }

        /// <summary>
        /// Tells whether a byte may begin a candidate word: ASCII A-Z (65-90), a-z (97-122)
        /// or a space (32). The punctuation bytes 91-96 that sit between the two letter
        /// ranges are rejected.
        /// </summary>
        /// <param name="value">Byte taken from the file at the candidate's start position.</param>
        /// <returns>True when the byte is an ASCII letter or a space.</returns>
        private static bool isWordStartByte(byte value)
        {
            bool isUpper = value >= 65 && value <= 90;
            bool isLower = value >= 97 && value <= 122;

            return isUpper || isLower || value == 32;
        }