Example #1
0
        // static void GetIndex(
        //     string indexerPath,
        //     out PatriciaSuffixTrie<string> trie,
        //     out SymSpell symSpell,
        //     out Dictionary<int, HashSet<int>> inverter,
        //     out Dictionary<string, int> dict,
        //     List<string> documents
        // ){

        // }
        /// <summary>
        /// Writes the index to <paramref name="indexerPath"/> as one flat binary file and
        /// returns the number of payload bytes written.
        /// </summary>
        /// <remarks>
        /// Layout, all integers 4-byte little-endian, sections written back to back:
        /// <list type="number">
        /// <item>for every word in <paramref name="dict"/>: UTF-8 byte length, then the UTF-8 bytes
        /// (the word ids stored in <paramref name="dict"/> are not written);</item>
        /// <item>for every document: token count, then the id of each space-separated token;</item>
        /// <item>for every entry of <paramref name="inverter"/>: word id, posting count, postings.</item>
        /// </list>
        /// No section counts or markers are written.
        /// </remarks>
        /// <param name="indexerPath">Destination file; created or overwritten.</param>
        /// <param name="trie">Not used by this method.</param>
        /// <param name="symSpell">Not used by this method.</param>
        /// <param name="inverter">Inverted index: word id -> set of document indexes.</param>
        /// <param name="dict">Word -> word id. Must contain every token of every document.</param>
        /// <param name="documents">Documents, tokens separated by a single space.</param>
        /// <returns>Total number of bytes written to the file.</returns>
        /// <exception cref="KeyNotFoundException">A document token is missing from <paramref name="dict"/>.</exception>
        static long SaveIndex(string indexerPath, PatriciaSuffixTrie<string> trie,
                              SymSpell symSpell, Dictionary<int, HashSet<int>> inverter, Dictionary<string, int> dict, List<string> documents)
        {
            long fileSize = 0;

            using (var fs = new FileStream(indexerPath, FileMode.Create))
            using (var sw = new BinaryWriter(fs, Encoding.UTF8))
            {
                // Section 1: vocabulary.
                foreach (var entry in dict)
                {
                    // Encode as UTF-8 (ASCII would turn every non-ASCII character into '?')
                    // and prefix with the *byte* length, which is what a reader has to consume.
                    byte[] wordBytes = Encoding.UTF8.GetBytes(entry.Key);

                    // BinaryWriter.Write(int) is always little-endian, unlike BitConverter.GetBytes
                    // which follows the host byte order.
                    sw.Write(wordBytes.Length);
                    sw.Write(wordBytes);

                    fileSize += sizeof(int) + wordBytes.Length;
                }

                // Section 2: documents as sequences of word ids.
                foreach (var phrase in documents)
                {
                    var tokens = phrase.Split(' ');
                    sw.Write(tokens.Length);
                    fileSize += (tokens.Length + 1L) * sizeof(int); // count + one id per token

                    foreach (var token in tokens)
                    {
                        sw.Write(dict[token]);
                    }
                }

                // Section 3: inverted index.
                foreach (var entry in inverter)
                {
                    sw.Write(entry.Key);
                    sw.Write(entry.Value.Count);
                    fileSize += (2L + entry.Value.Count) * sizeof(int); // id + count + postings

                    foreach (var referenceIndex in entry.Value)
                    {
                        sw.Write(referenceIndex);
                    }
                }
            }

            return fileSize;
        }
Example #2
0
        /// <summary>
        /// Demo entry point: reads a suggestion file (one document per line) from the application
        /// base directory, builds the word dictionary, the inverted index, the completion trie and
        /// the SymSpell dictionary, saves the index to disk and finally runs one sample search,
        /// reporting memory and timing figures on the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
            var path          = Path.Combine(baseDirectory, "real-suggests.txt");
            var indexerPath   = Path.Combine(baseDirectory, "real-suggests.index");

            // Alternative data sets:
            //   "all-suggests-cleaned.txt" / "all-suggests-cleaned.index"
            //   "small-suggests.txt"       / "small-suggests.index"

            var dict      = new Dictionary<string, int>();          // word -> word id (ids start at 1)
            var frequency = new Dictionary<string, int>();          // word -> number of occurrences
            var inverter  = new Dictionary<int, HashSet<int>>();    // word id -> document indexes
            var documents = new List<string>();

            var count       = 0; // last word id handed out
            var stringIndex = 0; // index of the document currently being read

            Console.WriteLine("Reading and indexing ...");

            // NOTE(review): the constructor argument is presumably the minimum query length - confirm.
            var trie = new PatriciaSuffixTrie<string>(1);

            using (var sr = new StreamReader(path))
            {
                string s = null;
                while ((s = sr.ReadLine()) != null)
                {
                    documents.Add(s);
                    foreach (var word in s.Split(' '))
                    {
                        int wordId;
                        if (dict.TryGetValue(word, out wordId))
                        {
                            frequency[word]++;
                        }
                        else
                        {
                            wordId          = ++count;
                            dict[word]      = wordId;
                            frequency[word] = 1;
                        }

                        // Key the postings by the word's own id rather than by the running counter.
                        HashSet<int> postings;
                        if (!inverter.TryGetValue(wordId, out postings))
                        {
                            postings         = new HashSet<int>();
                            inverter[wordId] = postings;
                        }
                        postings.Add(stringIndex);
                    }
                    stringIndex++;
                }
            }

            Console.WriteLine("Adding to completion dict...");
            long memSize = GC.GetTotalMemory(true);
            foreach (var entry in dict)
            {
                trie.Add(entry.Key, entry.Key);
            }
            Console.WriteLine("Add to completion dict: " + ((GC.GetTotalMemory(true) - memSize) / 1024 / 1024.0).ToString("N0") + " MB. Token count: " + dict.Count);

            Stopwatch stopWatch = Stopwatch.StartNew();
            Console.WriteLine("Adding to symSpell for fast spellCheck");
            memSize = GC.GetTotalMemory(true);
            // dictionary for symspell
            var spellChecker = new SymSpell(dict.Count, 2);

            foreach (var entry in frequency)
            {
                spellChecker.CreateDictionaryEntry(entry.Key, entry.Value);
            }

            Console.WriteLine("Spell dictionary constructed. " + ((GC.GetTotalMemory(true) - memSize) / 1024 / 1024.0).ToString("N0") + "MB, " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms. Tokens:" + frequency.Count);

            // Time the save itself: the timer must keep running until SaveIndex has returned.
            Console.WriteLine("Saving index ...");
            stopWatch = Stopwatch.StartNew();
            long byteCount = SaveIndex(indexerPath, trie, spellChecker, inverter, dict, documents);
            stopWatch.Stop();
            Console.WriteLine("File saved: " + byteCount + " bytes. Time ellapsed: " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms");

            Console.WriteLine("Searching ...");
            stopWatch = Stopwatch.StartNew();
            var hits = Search("342 cw", trie, spellChecker, inverter, dict, documents, 10);
            stopWatch.Stop();
            var timeEllapsed = stopWatch.Elapsed.TotalMilliseconds.ToString("0.0");

            // foreach(var hit in hits) {
            //     Console.WriteLine("--> " + hit.value);
            // }

            Console.WriteLine("Searching done." + timeEllapsed + "ms. Hits:" + hits.Length);
        }
Example #3
0
        /// <summary>
        /// Looks up every word of <paramref name="query"/> in the inverted index and returns one
        /// <see cref="SearchResult"/> per matching document, whose score is the number of query
        /// words (after expansion) that hit the document.
        /// </summary>
        /// <remarks>
        /// Per word the pipeline is: exact match; otherwise, for words of at most 3 characters,
        /// the first suggestion from the trie; otherwise spelling correction via SymSpell.
        /// Suggested/corrected words are pushed back to the front of the queue and are then only
        /// tried for an exact match, which guarantees that the loop terminates even when the
        /// corrector hands an unknown word back unchanged.
        /// The results are returned unsorted and <paramref name="limit"/> is currently not applied.
        /// </remarks>
        /// <param name="query">Input query; lower-cased and split on spaces.</param>
        /// <param name="trie">Trie used for prefix/infix matching.</param>
        /// <param name="symSpell">Spelling corrector.</param>
        /// <param name="inverter">Inverted index: word id -> document indexes.</param>
        /// <param name="dict">Word -> word id.</param>
        /// <param name="documents">Collection of documents, addressed by document index.</param>
        /// <param name="limit">Maximum number of results; currently unused.</param>
        /// <returns>One result per matched document, in no particular order.</returns>
        static SearchResult[] Search(
            string query,
            PatriciaSuffixTrie<string> trie,
            SymSpell symSpell,
            Dictionary<int, HashSet<int>> inverter,
            Dictionary<string, int> dict,
            List<string> documents,
            int limit
            )
        {
            var aggregated = new Dictionary<int, SearchResult>();
            var tokens     = new LinkedList<string>();

            // Invariant lower-casing keeps lookups independent of the current culture;
            // empty tokens (repeated/leading/trailing spaces) carry no search intent.
            var queryWords = query.ToLowerInvariant().Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            foreach (var queryWord in queryWords)
            {
                tokens.AddLast(queryWord);
            }

            // Number of tokens at the front of the queue that were produced by an expansion
            // (prefix suggestion or spelling correction) rather than typed by the user.
            var derivedPending = 0;

            while (tokens.Count > 0)
            {
                // pop_front the queue
                var word = tokens.First.Value;
                tokens.RemoveFirst();

                var isDerived = derivedPending > 0;
                if (isDerived)
                {
                    derivedPending--;
                }

                // 1. exact match
                int wordId;
                if (dict.TryGetValue(word, out wordId))
                {
                    foreach (var doc in inverter[wordId])
                    {
                        SearchResult existing;
                        if (aggregated.TryGetValue(doc, out existing))
                        {
                            existing.score++;
                            // Write back so the increment also sticks if SearchResult is a value type.
                            aggregated[doc] = existing;
                        }
                        else
                        {
                            aggregated[doc] = new SearchResult(documents[doc], 1);
                        }
                    }
                    continue;
                }

                // A suggestion/correction that still has no exact match is dropped:
                // expanding it again is what made the original loop spin forever.
                if (isDerived)
                {
                    continue;
                }

                // 2. short words: take the first prefix suggestion, if any
                if (word.Length <= 3)
                {
                    string suggestion = null;
                    foreach (var suggest in trie.Retrieve(word))
                    {
                        suggestion = suggest;
                        Console.WriteLine("Prefix matched: " + suggestion);
                        break;
                    }
                    if (suggestion != null)
                    {
                        // push_front
                        tokens.AddFirst(suggestion);
                        derivedPending = 1;
                        continue;
                    }
                }

                // 3. spelling correction; the corrected term may consist of several words.
                var lookupResult = symSpell.LookupCompound(word)[0].term.Split(' ');
                for (int i = lookupResult.Length - 1; i >= 0; --i)
                {
                    tokens.AddFirst(lookupResult[i]);
                }
                derivedPending = lookupResult.Length;
            }

            return aggregated.Values.ToArray();
        }
Example #4
0
        /**
         * Builds airportTree, cityTree and countryTree from the airports list.
         * Each airport is added under its lower-cased name, city and country.
         *
         * REQUIREMENT:
         * 1. The list of airports must be populated before this is called.
         *
         * Airports without a name are skipped entirely; a missing city or
         * country only skips the corresponding tree.
         */
        public void constructAirportTrees()
        {
            // Start from empty trees so the method can be called again after airports changes.
            // NOTE(review): the constructor argument is presumably the minimum query length - confirm.
            airportTree = new PatriciaSuffixTrie<AirportObject.Airport>(1);
            cityTree = new PatriciaSuffixTrie<AirportObject.Airport>(1);
            countryTree = new PatriciaSuffixTrie<AirportObject.Airport>(1);

            for (int i = 0; i < airports.Count; i++)
            {
                AirportObject.Airport a = airports[i];

                // skip the airport if its name is null
                if (a.name == null)
                {
                    continue;
                }

                airportTree.Add(a.name.ToLower(), a);

                // city and country may be missing independently of the name;
                // guard each one instead of failing on ToLower().
                if (a.city != null)
                {
                    cityTree.Add(a.city.ToLower(), a);
                }
                if (a.country != null)
                {
                    countryTree.Add(a.country.ToLower(), a);
                }
            }
        }