Ejemplo n.º 1
0
        /// <summary>
        /// Times a single Aho-Corasick lookup over a tokenised sentence and prints
        /// whether any keyword sequence was found, followed by the elapsed time.
        /// </summary>
        static void Main(string[] args)
        {
            // Start timing before the trie is set up so construction is included.
            Stopwatch timer = Stopwatch.StartNew();

            // True/false lookup: each space-separated token is one symbol of the input.
            string[] tokens = "ini laptop thomas".Split(' ');

            var matcher = new AhoCorasick.Trie <string, bool>();
            matcher.Add(new[] { "thomasa" }, true);
            matcher.Build();

            // (An earlier experiment loaded a 1000-word lorem-ipsum file from disk instead
            //  of the inline sentence above; it was disabled and is not reproduced here.)
            bool found = matcher.Find(tokens).Any();
            Console.WriteLine(found);

            timer.Stop();

            Console.WriteLine("Elapsed={0}", timer.Elapsed);
            Console.ReadKey();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Loads a text file, searches it for two words (each mapped to its index in
        /// <c>words</c>) and checks how many hits the trie reports.
        /// </summary>
        public void LineNumbers()
        {
            // Join every line without a separator, exactly as the previous line-by-line loop did,
            // but in a single pass: repeated "+=" on a string is quadratic in the file size.
            string textfile = string.Concat(File.ReadLines(@"C:\Новая папка\1.txt"));

            string[] words = new[] { "hello", "word" };

            AhoCorasick.Trie <int> trie = new AhoCorasick.Trie <int>();
            for (int i = 0; i < words.Length; i++)
            {
                // The value stored with each word is its position in the words array.
                trie.Add(words[i], i);
            }
            trie.Build();

            int[] lines = trie.Find(textfile).ToArray();

            // The expected count depends on the contents of the file on disk.
            Assert.AreEqual(5, lines.Length);
            // Assert.AreEqual(1, lines[0]);
            // Assert.AreEqual(1, lines[1]);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Loads a text file, searches it for the word "hello" and checks the number of
        /// hits and the first reported match.
        /// </summary>
        public void findsome()
        {
            // Join every line without a separator, exactly as the previous line-by-line loop did,
            // but in a single pass: repeated "+=" on a string is quadratic in the file size.
            string textfile = string.Concat(File.ReadLines(@"C:\Новая папка\1.txt"));

            AhoCorasick.Trie trie = new AhoCorasick.Trie();
            trie.Add("hello");
            // trie.Add("word");
            trie.Build();

            string[] matches = trie.Find(textfile).ToArray();

            // The expected count depends on the contents of the file on disk.
            Assert.AreEqual(5, matches.Length);
            Assert.AreEqual("hello", matches[0]);
            // Assert.AreEqual("hellonull",matches[1]);
            // Assert.AreEqual("word", matches[1]);
        }
            /// <summary>
            /// Returns the hashtags found in <paramref name="text"/> that stand alone, i.e. are not
            /// directly preceded or followed by a character that <c>Search.IsEnglishLetter</c> accepts.
            /// </summary>
            /// <param name="trie">Built trie whose match values are string-encoded indices into <paramref name="hashtags"/>.</param>
            /// <param name="hashtags">Words that were added to the trie; a match value <c>"i"</c> refers to <c>hashtags[i]</c>.</param>
            /// <param name="text">Text to search.</param>
            /// <returns>One entry per verified hit, in the order the trie reported the hits; empty when nothing qualifies.</returns>
            /// <exception cref="FormatException">A trie value is not an integer.</exception>
            /// <exception cref="ArgumentOutOfRangeException">A trie value is not a valid index into <paramref name="hashtags"/>.</exception>
            public static List <string> Match(AhoCorasick.Trie trie, List <string> hashtags, string text)
            {
                var verifiedWords = new List <string>();

                // For every hashtag index: where to resume searching for its next occurrence.
                // Assumes the trie reports one hit per occurrence, in text order, so the k-th hit
                // of a word lines up with its k-th occurrence (TODO confirm against the trie
                // implementation). Previously every hit was checked against the FIRST occurrence
                // only, so "xfoo foo" rejected the stand-alone "foo" and "foo xfoo" accepted it twice.
                var nextSearchStart = new Dictionary <int, int>();

                foreach (string hit in trie.Find(text))
                {
                    // int rather than Int16: indices above 32767 are legitimate list positions.
                    int    wordNo = int.Parse(hit, System.Globalization.CultureInfo.InvariantCulture);
                    string word   = hashtags[wordNo];

                    int searchFrom;
                    if (!nextSearchStart.TryGetValue(wordNo, out searchFrom))
                    {
                        searchFrom = 0;
                    }
                    if (searchFrom > text.Length)
                    {
                        continue;
                    }

                    // Ordinal search: the trie matches raw characters, so culture rules must not apply.
                    int startingPosition = text.IndexOf(word, searchFrom, StringComparison.Ordinal);
                    if (startingPosition < 0)
                    {
                        // Not found: skip instead of probing characters at a meaningless offset.
                        continue;
                    }
                    // Advance by one (not by word length) so overlapping occurrences are still visited.
                    nextSearchStart[wordNo] = startingPosition + 1;

                    int endingPosition = startingPosition + word.Length;

                    // The beginning and the end of the text count as non-letter boundaries.
                    bool front = startingPosition == 0 || !Search.IsEnglishLetter(text[startingPosition - 1]);
                    bool end   = endingPosition == text.Length || !Search.IsEnglishLetter(text[endingPosition]);

                    if (front && end)
                    {
                        verifiedWords.Add(word);
                    }
                }
                return(verifiedWords);
            }
Ejemplo n.º 5
0
        /// <summary>
        /// A token sequence that never occurs in the input must produce no match.
        /// </summary>
        public void Words()
        {
            var matcher = new AhoCorasick.Trie <string, bool>();
            matcher.Add(new[] { "wol" }, true);
            matcher.Build();

            // The input is a sequence of tokens, split on ':'; none of them equals "wol".
            string[] tokens = "hello:hello:wor:ddsdsdf:word:hello".Split(':');
            bool anyHit = matcher.Find(tokens).Any();

            Assert.IsFalse(anyHit);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// A two-token sequence that appears in the tokenised input must be found.
        /// </summary>
        public void Words()
        {
            string[] tokens = "one two three four".Split(' ');
            string[] phrase = new[] { "three", "four" };

            var matcher = new AhoCorasick.Trie <string, bool>();
            matcher.Add(phrase, true);
            matcher.Build();

            bool anyHit = matcher.Find(tokens).Any();
            Assert.IsTrue(anyHit);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// The trie must report at least one match when the text contains added words.
        /// </summary>
        public void Contains()
        {
            var matcher = new AhoCorasick.Trie();
            foreach (string keyword in new[] { "hello", "world" })
            {
                matcher.Add(keyword);
            }
            matcher.Build();

            string sentence = "hello and welcome to this beautiful world!";
            bool anyHit = matcher.Find(sentence).Any();

            Assert.IsTrue(anyHit);
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Searching a sentence that contains both registered words yields a non-empty result.
        /// </summary>
        public void Contains()
        {
            string sentence = "hello and welcome to this beautiful world!";

            var keywords = new AhoCorasick.Trie();
            keywords.Add("hello");
            keywords.Add("world");
            keywords.Build();

            var hits = keywords.Find(sentence);
            Assert.IsTrue(hits.Any());
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Both registered words occur once in the sentence and must be reported
        /// in the order in which they appear in the text.
        /// </summary>
        public void HelloWorld()
        {
            var keywords = new AhoCorasick.Trie();
            keywords.Add("hello");
            keywords.Add("world");
            keywords.Build();

            string sentence = "hello and welcome to this beautiful world!";
            string[] found = keywords.Find(sentence).ToArray();

            Assert.AreEqual(2, found.Length);

            // "hello" starts the sentence, "world" comes last.
            Assert.AreEqual("hello", found[0]);
            Assert.AreEqual("world", found[1]);
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Only whole keywords are reported: "hello" occurs in the text, whereas
        /// "worldddddd" does not (the text contains "wo5rld"), so exactly one match is expected.
        /// </summary>
        public void HelloWorld()
        {
            string text = "hello and welcome to this beautiful wo5rld!";

            AhoCorasick.Trie trie = new AhoCorasick.Trie();
            trie.Add("hello");
            trie.Add("worldddddd");
            trie.Build();

            string[] matches = trie.Find(text).ToArray();

            // The previous expectations (two matches, the second being "world5dd") could never
            // hold: "world5dd" was never added to the trie and "worldddddd" is not in the text.
            Assert.AreEqual(1, matches.Length);
            Assert.AreEqual("hello", matches[0]);
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Each match is reported as a (word, index) tuple; the expected indices are
        /// 4 for "hello" and 40 for "world".
        /// </summary>
        public void HelloWorld()
        {
            var keywords = new AhoCorasick.Trie();
            keywords.Add("hello");
            keywords.Add("world");
            keywords.Build();

            string sentence = "hello and welcome to this beautiful world!";
            var found = keywords.Find(sentence).ToArray();

            // 4 and 40 are the offsets of the last character of each word in the sentence.
            var expectedHello = Tuple.Create("hello", 4);
            var expectedWorld = Tuple.Create("world", 40);

            Assert.AreEqual(2, found.Length);
            Assert.AreEqual(expectedHello, found[0]);
            Assert.AreEqual(expectedWorld, found[1]);
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Words are stored with their array index as the value; the search must return
        /// those indices in the order the words occur in the text.
        /// </summary>
        public void LineNumbers()
        {
            string[] words = new[] { "hello", "world" };

            var indexByWord = new AhoCorasick.Trie<int>();
            int position = 0;
            foreach (string word in words)
            {
                indexByWord.Add(word, position);
                position++;
            }
            indexByWord.Build();

            string sentence = "world, i hello you!";
            int[] hits = indexByWord.Find(sentence).ToArray();

            // "world" (index 1) appears before "hello" (index 0) in the sentence.
            Assert.AreEqual(2, hits.Length);
            Assert.AreEqual(1, hits[0]);
            Assert.AreEqual(0, hits[1]);
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Words can carry an arbitrary payload (here a (word, id) tuple); each match is
        /// reported as the payload paired with an index into the text.
        /// </summary>
        public void WordsAndIds()
        {
            var helloPayload = Tuple.Create("hello", 123);
            var worldPayload = Tuple.Create("world", 456);

            var keywords = new AhoCorasick.Trie <Tuple <string, int> >();
            keywords.Add("hello", helloPayload);
            keywords.Add("world", worldPayload);
            keywords.Build();

            string sentence = "hello and welcome to this beautiful world!";
            var found = keywords.Find(sentence).ToArray();

            Assert.AreEqual(2, found.Length);

            // 4 and 40 are the offsets of the last character of each word in the sentence.
            Assert.AreEqual(Tuple.Create(Tuple.Create("hello", 123), 4), found[0]);
            Assert.AreEqual(Tuple.Create(Tuple.Create("world", 456), 40), found[1]);
        }
Ejemplo n.º 14
0
        // Reminder: when counting words, check whether the same key has already been
        // added before adding it again.
        /// <summary>
        /// Prints every occurrence of the registered words found in a sample sentence,
        /// one per line.
        /// </summary>
        public static void Main()
        {
            string sentence = "hello and welcome to this beautiful world world hello!";

            // Register the words to look for, then build the search tree.
            var keywords = new AhoCorasick.Trie();
            keywords.Add("hello");
            keywords.Add("world");
            keywords.Build();

            // Report each hit as it is produced by the search.
            foreach (string hit in keywords.Find(sentence))
            {
                Console.WriteLine(hit);
            }
        }
Ejemplo n.º 15
0
        /// <summary>
        /// Each word is registered with its array index as the value; the search must yield
        /// the indices in the order the words occur in the text.
        /// </summary>
        public void LineNumbers()
        {
            string[] words = new[] { "hello", "world" };

            var indexByWord = new AhoCorasick.Trie <int>();
            int next = 0;
            while (next < words.Length)
            {
                indexByWord.Add(words[next], next);
                next++;
            }
            indexByWord.Build();

            string sentence = "world, i hello you!";
            int[] hits = indexByWord.Find(sentence).ToArray();

            Assert.AreEqual(2, hits.Length);
            // "world" is words[1] and comes first in the sentence; "hello" is words[0].
            Assert.AreEqual(1, hits[0]);
            Assert.AreEqual(0, hits[1]);
        }
Ejemplo n.º 16
0
        /// <summary>
        /// Builds a trie from the lower-cased tags in <paramref name="data"/> and starts the
        /// search over <c>data.Path</c> via <c>Search.Match</c>.
        /// </summary>
        /// <param name="data">Request body carrying the path to search and the tags to look for.</param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null (e.g. missing request body).</exception>
        /// <exception cref="ArgumentException"><c>data.Tags.Text</c> is not a <c>List&lt;string&gt;</c>.</exception>
        public void StartSearch([FromBody] ItemGroup data)
        {
            if (data == null)
            {
                throw new ArgumentNullException("data");
            }

            var path = data.Path;

            // "as" yields null when the value is missing or of another type; fail with a clear
            // message here instead of a NullReferenceException on the next use.
            List <string> hashtags = data.Tags.Text as List <string>;
            if (hashtags == null)
            {
                throw new ArgumentException("data.Tags.Text must be a non-null List<string>.", "data");
            }

            hashtags = hashtags.ConvertAll(d => d.ToLower());

            AhoCorasick.Trie trie = new AhoCorasick.Trie();

            // Each tag is stored with its list index (as a string) as the match value.
            for (int i = 0; i < hashtags.Count; i++)
            {
                trie.Add(hashtags[i], (i).ToString());
            }

            // build search tree
            trie.Build();
            Console.WriteLine("Trie built.");

            Search.Match(path, trie, hashtags);
        }
        // Hashtag vocabulary, one array per camp. The trie searches the union of the four arrays,
        // and each array also decides which category a matched hashtag is counted towards.
        // Keeping a single copy prevents the search list and the category lists from drifting apart.
        private static readonly string[] ProTrumpHashtags =
        {
            "alwaystrump",
            "babesfortrump",
            "bikers4trump",
            "bikersfortrump",
            "blacks4trump",
            "buildthatwall",
            "buildthewall",
            "cafortrump",
            "democrats4trump",
            "donuldtrumpforpresident",
            "feelthetrump",
            "femineamerica4trump",
            "gays4trump",
            "gaysfortrump",
            "gotrump",
            "heswithus",
            "imwithhim",
            "imwithyou",
            "latinos4trump",
            "latinosfortrump",
            "maga",
            "makeamericagreat",
            "makeamericagreatagain",
            "makeamericasafeagain",
            "makeamericaworkagain",
            "onlytrump",
            "presienttrump",
            "rednationrising",
            "trump16",
            "trump2016",
            "trumpcares",
            "trumpforpresident",
            "trumpiswithyou",
            "trumppence2016",
            "trumpstrong",
            "trumptrain",
            "veteransfortrump",
            "vets4trump",
            "votegop",
            "votetrump",
            "votetrump2016",
            "votetrumppence2016",
            "woman4trump",
            "women4trump",
            "womenfortrump"
        };

        private static readonly string[] AntiTrumpHashtags =
        {
            "antitrump",
            "anyonebuttrump",
            "boycotttrump",
            "chickentrump",
            "clowntrain",
            "crookeddonald",
            "crookeddrumpf",
            "crookedtrump",
            "crybabytrump",
            "defeattrump",
            "dirtydonald",
            "donthecon",
            "drumpf",
            "dumbdonald",
            "dumpthetrump",
            "dumptrump",
            "freethedelegates",
            "lgbthatestrumpparty",
            "loserdonald",
            "losertrump",
            "lovetrumpshate",
            "lovetrumpshates",
            "lyindonald",
            "lyingdonald",
            "lyingtrump",
            "lyintrump",
            "makedonalddrumpfagain",
            "nevergop",
            "nevertrump",
            "nevertrumppence",
            "nodonaldtrump",
            "notrump",
            "notrumpanytime",
            "poordonald",
            "racisttrump",
            "releasethereturns",
            "releaseyourtaxes",
            "ripgop",
            "showusyourtaxes",
            "sleazydonald",
            "stoptrump",
            "stupidtrump",
            "traitortrump",
            "treasonoustrump",
            "trump20never",
            "trumplies",
            "trumpliesmatter",
            "trumpsopoor",
            "trumpthefraud",
            "trumptrainwreck",
            "trumptreason",
            "unfittrump",
            "weakdonald",
            "wherertrumpstaxes",
            "wheresyourtaxes",
            "whinylittlebitch",
            "womentrumpdonald"
        };

        private static readonly string[] ProClintonHashtags =
        {
            "bernwithher",
            "bluewave2016",
            "clintonkaine2016",
            "estoyconella",
            "herstory",
            "heswithher",
            "hillafornia",
            "hillary2016",
            "hillaryforamerica",
            "hillaryforpr",
            "hillaryforpresident",
            "hillarysopresidential",
            "hillarysoqualified",
            "hillarystrong",
            "hillstorm2016",
            "hillyes",
            "hrc2016",
            "hrcisournominee",
            "iamwithher",
            "imwither",
            "imwithher",
            "imwithher2016",
            "imwhithhillary",
            "imwiththem",
            "itrusther",
            "itrusthillary",
            "madamepresident",
            "madampresident",
            "momsdemandhillary",
            "ohhillyes",
            "readyforhillary",
            "republicans4hillary",
            "republicansforhillary",
            "sheswithus",
            "standwithmadampotus",
            "strongertogether",
            "uniteblue",
            "vote4hillary",
            "voteblue",
            "voteblue2016",
            "votehillary",
            "welovehillary",
            "yeswekaine"
        };

        private static readonly string[] AntiClintonHashtags =
        {
            "clintoncorruption",
            "clintoncrime",
            "clintoncrimefamily",
            "clintoncrimefoundation",
            "corrupthillary",
            "criminalhillary",
            "crookedclinton",
            "crookedclintons",
            "crookedhilary",
            "crookedhiliary",
            "crookedhillary",
            "crookedhillaryclinton",
            "deletehillary",
            "dropouthillary",
            "fbimwithher",
            "handcuffhillary",
            "heartlesshillary",
            "hillary2jail",
            "hillary4jail",
            "hillary4prison",
            "hillary4prison2016",
            "hillaryforprison",
            "hillaryforprison2016",
            "hillaryliedpeopledied",
            "hillarylies",
            "hillaryliesmatter",
            "hillarylosttome",
            "hillaryrottenclinton",
            "hillarysolympics",
            "hillno",
            "hypocritehillary",
            "imnotwithher",
            "indicthillary",
            "iwillneverstandwithher",
            "killary",
            "lockherup",
            "lyingcrookedhillary",
            "lyinghillary",
            "lyinhillary",
            "moretrustedthanhillary",
            "neverclinton",
            "nevereverhillary",
            "neverhillary",
            "neverhilllary",
            "nohillary2016",
            "nomoreclintons",
            "notwithher",
            "ohhillno",
            "releasethetranscripts",
            "riskyhillary",
            "shelies",
            "sickhillary",
            "stophillary",
            "stophillary2016",
            "theclintoncontamination",
            "wehatehillary",
            "whatmakeshillaryshortcircuit"
        };

        /// <summary>
        /// Reads a file containing one JSON tweet per line, finds the campaign hashtags each tweet
        /// contains and, for every tweet with at least one hit, records what percentage of its hits
        /// falls into each of the four categories (Pro-Trump, Anti-Trump, Pro-Clinton, Anti-Clinton).
        /// The result, keyed by the tweet's <c>TimestampMs</c>, is serialised to
        /// <c>SIMILARITY_&lt;input name&gt;.txt</c> in the directory of the input file.
        /// </summary>
        /// <param name="filePath">
        /// Fallback input path. As before, the path is normally taken from the first key of the
        /// query string; this parameter is only consulted when the query string has no usable key.
        /// </param>
        /// <returns>
        /// A JSON object with <c>success</c> and <c>responseText</c>; <c>success</c> is false when
        /// no input file was specified.
        /// </returns>
        public JsonResult StartModeling(string filePath)
        {
            // NOTE(review): the path comes straight from the request and is used for both reading
            // and writing without validation — confirm this endpoint is not exposed to untrusted callers.
            string fileName = this.Request.Query.Select(q => q.Key).FirstOrDefault();
            if (string.IsNullOrEmpty(fileName))
            {
                fileName = filePath;
            }
            if (string.IsNullOrEmpty(fileName))
            {
                return(new JsonResult(new { success = false, responseText = "No input file specified." }));
            }

            ConcurrentDictionary <string, Dictionary <string, int> > tweetList = new ConcurrentDictionary <string, Dictionary <string, int> >();

            // Search vocabulary: the four camps in a fixed order. All entries are lower-case
            // literals already, so no case conversion is needed here.
            List <string> hashtags = new List <string>();
            hashtags.AddRange(ProTrumpHashtags);
            hashtags.AddRange(AntiTrumpHashtags);
            hashtags.AddRange(ProClintonHashtags);
            hashtags.AddRange(AntiClintonHashtags);

            // Each hashtag is stored with its list index (as a string) as the match value.
            AhoCorasick.Trie trie = new AhoCorasick.Trie();
            for (int i = 0; i < hashtags.Count; i++)
            {
                trie.Add(hashtags[i], (i).ToString());
            }

            // build search tree
            trie.Build();

            // Category membership, built once and only read inside the parallel loop (previously the
            // whole table was re-created for every matching tweet and probed with List.Contains).
            // "trump" and "clinton" were part of the original Pro-Trump list although they are not
            // in the search vocabulary; they are kept so category counting is unchanged.
            Dictionary <string, HashSet <string> > categories = new Dictionary <string, HashSet <string> >()
            {
                { "Pro-Trump", new HashSet <string>(ProTrumpHashtags) { "trump", "clinton" } },
                { "Anti-Trump", new HashSet <string>(AntiTrumpHashtags) },
                { "Pro-Clinton", new HashSet <string>(ProClintonHashtags) },
                { "Anti-Clinton", new HashSet <string>(AntiClintonHashtags) }
            };

            Parallel.ForEach(System.IO.File.ReadLines(fileName), new ParallelOptions {
                MaxDegreeOfParallelism = 32
            }, (line, _, lineNumber) =>
            {
                try
                {
                    var tweet = JsonConvert.DeserializeObject <_Tweet>(line);
                    if (tweet == null || tweet.Text == null)
                    {
                        // Blank line or a record without text: nothing to analyse.
                        return;
                    }

                    // Invariant lower-casing: the vocabulary is plain ASCII, so the result must not
                    // depend on the server's culture (e.g. Turkish 'I' handling).
                    var matched = Proximity.Match(trie, hashtags, tweet.Text.ToLowerInvariant());
                    if (matched.Count == 0)
                    {
                        return;
                    }

                    // Count the hits per category; a word contributes to every category containing it.
                    Dictionary <string, int> similarity = new Dictionary <string, int>();
                    int total = 0;

                    foreach (KeyValuePair <string, HashSet <string> > category in categories)
                    {
                        int hits = 0;
                        foreach (string word in matched)
                        {
                            if (category.Value.Contains(word))
                            {
                                hits++;
                            }
                        }
                        similarity.Add(category.Key, hits);
                        total += hits;
                    }

                    // Convert to whole percentages (truncated, so they need not add up to 100).
                    // A zero total yields 0 instead of converting NaN to int.
                    Dictionary <string, int> similarityPercentage = new Dictionary <string, int>();

                    foreach (KeyValuePair <string, int> key in similarity)
                    {
                        int percentage = total == 0 ? 0 : (int)(((double)key.Value / total) * 100);
                        similarityPercentage.Add(key.Key, percentage);
                    }

                    // Tweets sharing a timestamp with an earlier entry are not added again.
                    tweetList.TryAdd(tweet.TimestampMs, similarityPercentage);
                }
                catch (Exception e)
                {
                    // Best effort: one bad line must not abort the whole run, but report which
                    // line (zero-based index) failed and why instead of a bare "Error".
                    Console.WriteLine("Error on line {0}: {1}", lineNumber, e.Message);
                }
            });

            // Path.Combine keeps a bare file name relative; the former "+ \"/\"" concatenation
            // produced "/SIMILARITY_..." (the filesystem root) when the input had no directory part.
            string outputDirectory = Path.GetDirectoryName(fileName) ?? string.Empty;
            string outputPath      = Path.Combine(outputDirectory, "SIMILARITY_" + Path.GetFileNameWithoutExtension(fileName) + ".txt");

            using (StreamWriter sw = new StreamWriter(outputPath))
            {
                sw.WriteLine(JsonConvert.SerializeObject(tweetList));
            }

            Console.WriteLine("Finished Vader " + DateTime.Now);

            return(new JsonResult(new { success = true, responseText = "Finished Modeling." }));
        }
Ejemplo n.º 18
0
        /// <summary>
        /// The token sequence "three", "four" occurs in the tokenised input, so the
        /// search must report at least one match.
        /// </summary>
        public void Words()
        {
            var matcher = new AhoCorasick.Trie<string, bool>();
            matcher.Add(new[] { "three", "four" }, true);
            matcher.Build();

            // Each space-separated word is one symbol of the searched sequence.
            string[] tokens = "one two three four".Split(' ');
            var hits = matcher.Find(tokens);

            Assert.IsTrue(hits.Any());
        }
Ejemplo n.º 19
0
        /// <summary>
        /// Validates the current search settings and caches the state derived from them.
        /// </summary>
        /// <remarks>
        /// Every problem found is appended to <c>_lstErrors</c>; nothing is thrown.
        /// For the file-contents and file-name criteria:
        /// in regex mode the token list is set to <c>null</c> and the pattern is only
        /// checked for validity; in plain-text mode the text is split on single spaces
        /// into a token list, which is lower-cased when the matching "case sensitive"
        /// box is NOT checked. In plain-text mode the file-contents tokens are also
        /// loaded into a freshly built <c>AhoCorasick.Trie</c>.
        /// </remarks>
        private void ValidateSearchOptions()
        {
            SetStatus("Validating Search Parameters");

            if (_lstSearchPaths.Count == 0)
            {
                _lstErrors.Add("Please add one or more paths to search.");
            }
            else
            {
                foreach (string p in _lstSearchPaths)
                {
                    if (!Directory.Exists(p))
                    {
                        _lstErrors.Add("The directory '" + p + "' does not exist.");
                    }
                }
            }

            // Reset the trie; it is only rebuilt for a plain-text contents search.
            _objFileContentsSearchTrie = null;

            // Begin by caching search tokens if needed
            if (_chkFileContentsRegex.Checked)
            {
                _lstFileContentsSearchTokens = null;
                try
                {
                    // Matching against an empty input is only a way to make the
                    // Regex engine parse the pattern and reject invalid ones.
                    Regex.Match("", _cboFileContents.Text);
                }
                catch (ArgumentException)
                {
                    _lstErrors.Add("File contents regex is not a valid regular expression.");
                }
            }
            else
            {
                _strFileContentsSearch       = _cboFileContents.Text;
                _lstFileContentsSearchTokens = _cboFileContents.Text.Split(' ').ToList();

                // Normalise the tokens for a case-insensitive search. ConvertAll
                // returns a NEW list, so its result has to be stored; the previous
                // code discarded it and tested the opposite checkbox state.
                if (!_chkFileContentsCaseSensitive.Checked)
                {
                    _lstFileContentsSearchTokens = _lstFileContentsSearchTokens.ConvertAll(x => x.ToLower());
                }

                // Build the file contents search trie. The tokens are already
                // lower-cased above when the search is case-insensitive.
                _objFileContentsSearchTrie = new AhoCorasick.Trie();
                foreach (string word in _lstFileContentsSearchTokens)
                {
                    _objFileContentsSearchTrie.Add(word);
                }
                _objFileContentsSearchTrie.Build();
            }

            if (_chkFilenameRegex.Checked)
            {
                _lstFileNameSearchTokens = null;
                try
                {
                    // Same validity probe as for the contents pattern.
                    Regex.Match("", _cboFileName.Text);
                }
                catch (ArgumentException)
                {
                    _lstErrors.Add("File name regex is not a valid regular expression.");
                }
            }
            else
            {
                _lstFileNameSearchTokens = _cboFileName.Text.Split(' ').ToList();

                // Same normalisation as for the contents tokens: lower-case only
                // when the search is case-insensitive, and keep the converted list.
                if (!_chkFilenameCaseSensitive.Checked)
                {
                    _lstFileNameSearchTokens = _lstFileNameSearchTokens.ConvertAll(x => x.ToLower());
                }
            }
        }
Ejemplo n.º 20
0
            /// <summary>
            /// Searches every <c>*.json</c> file below <paramref name="path"/> (all
            /// sub-directories included) for tweets containing one of the given
            /// hashtags delimited by non-English-letter characters or the text bounds.
            /// </summary>
            /// <remarks>
            /// For each input file the matching tweets are serialized, one per line, to
            /// <c>Matched&lt;file name&gt;.txt</c> in the same directory. Afterwards
            /// <c>MergeSearchResults(path, "output.txt")</c> is called and the counters
            /// plus per-hashtag totals are written to <c>Search_stats.txt</c> under
            /// <paramref name="path"/>. Files that cannot be deserialized are skipped,
            /// as are tweets that are <c>null</c> or have no text.
            /// </remarks>
            /// <param name="path">Root directory that is scanned for JSON files and receives the statistics file.</param>
            /// <param name="trie">Trie used to scan the lower-cased tweet text; every value it yields is parsed as an index into <paramref name="hashtags"/>.</param>
            /// <param name="hashtags">Hashtags to count; also used as the keys of the per-hashtag totals, so entries must be unique.</param>
            public static void Match(string path, AhoCorasick.Trie trie, List <string> hashtags)
            {
                DirectoryInfo rootFolder = new DirectoryInfo(path);
                var           files      = rootFolder.EnumerateFiles("*.json", SearchOption.AllDirectories);

                tags = hashtags.ToDictionary(x => x, x => 0);

                Parallel.ForEach(files, new ParallelOptions {
                    MaxDegreeOfParallelism = 16
                }, (file1) =>
                {
                    int found   = 0;
                    int matched = 0;

                    Console.WriteLine("Reading " + file1.FullName);

                    var jsonText = System.IO.File.ReadAllText(file1.FullName);
                    IList <_Tweet> tweets;

                    try
                    {
                        tweets = JsonConvert.DeserializeObject <IList <_Tweet> >(jsonText);
                    }
                    catch (Exception e)
                    {
                        // Best effort: a file that cannot be parsed is skipped, but
                        // the reason is reported instead of being silently dropped.
                        Console.WriteLine("Skipping " + file1.FullName + ": " + e.Message);
                        return;
                    }

                    // A JSON document consisting of "null" deserializes to null.
                    if (tweets == null)
                    {
                        return;
                    }

                    var matchedTweets = new List <string>();
                    for (var i = 0; i < tweets.Count; i++)
                    {
                        // The null check has to come BEFORE the text is read,
                        // otherwise it can never protect against anything.
                        if (tweets[i] == null || tweets[i].Text == null)
                        {
                            continue;
                        }

                        String text = tweets[i].Text.ToLower();

                        List <int> positions = new List <int>();

                        foreach (string position in trie.Find(text))
                        {
                            // int rather than Int16: the index must be able to
                            // address hashtag lists with more than 32767 entries.
                            positions.Add(int.Parse(position));
                        }

                        found++;

                        if (positions.Count == 0)
                        {
                            continue;
                        }

                        // Keep only the hits that are not embedded in a longer word.
                        var verifiedWords = new List <string>();

                        foreach (int wordNo in positions)
                        {
                            string word = hashtags[wordNo];

                            // Ordinal search so the returned offset lines up with
                            // word.Length character by character.
                            // NOTE(review): only the first occurrence is inspected; a
                            // later stand-alone occurrence is not considered - confirm
                            // whether that is intended.
                            int startingPosition = text.IndexOf(word, StringComparison.Ordinal);
                            if (startingPosition < 0)
                            {
                                continue;
                            }

                            int endingPosition = startingPosition + word.Length;

                            // The beginning or end of the text counts as a word boundary.
                            bool front = startingPosition == 0 || !IsEnglishLetter(text[startingPosition - 1]);
                            bool end   = endingPosition == text.Length || !IsEnglishLetter(text[endingPosition]);

                            if (front && end)
                            {
                                verifiedWords.Add(word);
                            }
                        }

                        if (verifiedWords.Count == 0)
                        {
                            continue;
                        }

                        matched++;
                        matchedTweets.Add(JsonConvert.SerializeObject(tweets[i]));

                        // The totals are shared between all worker threads, hence the lock.
                        // NOTE(review): a hashtag reported several times for one tweet is
                        // counted several times - confirm that this is intended.
                        foreach (string word in verifiedWords)
                        {
                            lock (keywordLock)
                            {
                                tags[word] += 1;
                            }
                        }
                    }

                    Interlocked.Add(ref TweetsFound, found);
                    Interlocked.Add(ref TweetsMatched, matched);
                    System.IO.File.WriteAllLines(file1.DirectoryName + "/Matched" + file1.Name + ".txt", matchedTweets);
                });

                bool mergeIntoOneFile = true;

                if (mergeIntoOneFile)
                {
                    MergeSearchResults(path, "output.txt");
                }

                using (StreamWriter sw = new StreamWriter(path + "/Search_stats.txt"))
                {
                    sw.Write(JsonConvert.SerializeObject(new Stats
                    {
                        Found   = TweetsFound,
                        Matched = TweetsMatched,
                        Results = tags
                    }, Formatting.Indented));

                    // NOTE(review): this bumps the counter after the statistics were
                    // already written; kept because other code may read TweetsFound,
                    // but it looks like a leftover - confirm.
                    Interlocked.Increment(ref TweetsFound);
                }
            }