Exemplo n.º 1
0
        /// <summary>
        /// Runs <paramref name="rawstopwords"/> through the HTML/wiki text processor and
        /// returns the set of word ids of every decoded, non-integer token.
        /// </summary>
        /// <remarks>
        /// Tokens that are not yet present in <c>wikidata.worddict</c> receive a fresh id
        /// (one greater than the largest id currently stored) and are registered in
        /// <c>wikidata.wordidf</c> with an initial count of 1. Every resulting id is also
        /// added to <paramref name="reservedWords"/>.
        /// </remarks>
        /// <param name="rawstopwords">Raw text containing the stop words to convert.</param>
        /// <param name="mem">Memory manager handed to the <c>DecodedTextClass</c> constructor.</param>
        /// <param name="wikidata">Holder of the word dictionary and per-word counts; both may be extended.</param>
        /// <param name="reservedWords">Set that receives every word id produced by this call.</param>
        /// <returns>The distinct word ids of the tokens found in <paramref name="rawstopwords"/>.</returns>
        static HashSet<int> convertStopWords(string rawstopwords, ref MemoryManager mem, ref WikiData wikidata, ref HashSet<int> reservedWords)
        {
            // Scratch state filled by DecodedTextClass.GetWord on every call.
            char[] chararray = new char[0];
            int startindex = 0;
            int length = 0;
            bool sticky = false;
            bool stopword = false;
            int division = 0;
            bool isInt = false;
            int decodedInt = -1;

            // Decode the raw text; the processor writes its output into dtc.
            HTMLWikiProcessor hwproc = new HTMLWikiProcessor(new HashSet<int>(), false);
            DecodedTextClass dtc = new DecodedTextClass(mem, true);
            hwproc.LoadDecodedTextClass(ref dtc);
            hwproc.ProcessHTML(rawstopwords);

            HashSet<int> rawids = new HashSet<int>();

            // The next free word id is one past the largest id already in the dictionary.
            int maxid = 0;
            foreach (KeyValuePair<string, int> kvp in wikidata.worddict)
            {
                if (kvp.Value > maxid)
                {
                    maxid = kvp.Value;
                }
            }
            maxid++;

            int len = dtc.NumberWords();
            for (int i = 0; i < len; i++)
            {
                if (!dtc.GetWord(i, ref chararray, ref startindex, ref length, ref sticky, ref division, ref stopword, ref isInt, ref decodedInt))
                {
                    continue;
                }

                // Integer tokens never become stop-word ids.
                if (isInt)
                {
                    continue;
                }

                string token = new string(chararray, startindex, length);

                int wordid;
                if (!wikidata.worddict.TryGetValue(token, out wordid))
                {
                    // Unknown token: allocate a new id and register it exactly once.
                    // (The count used to be bumped twice here because the registration
                    // code was duplicated, leaving new stop words with a count of 2.)
                    wordid = maxid;
                    wikidata.worddict.TryAdd(token, wordid);
                    if (!wikidata.wordidf.TryAdd(wordid, 1))
                    {
                        ++wikidata.wordidf[wordid];
                    }
                    maxid++;
                }

                // HashSet.Add ignores values that are already present.
                reservedWords.Add(wordid);
                rawids.Add(wordid);
            }

            return rawids;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Encodes the title of every valid concept and every category name into an
        /// array of word ids, extending the word dictionary with any unseen tokens.
        /// </summary>
        /// <remarks>
        /// For each valid entry of <c>wikidata.conceptdata</c> the title is registered in
        /// <c>wikidata.conceptdict</c> and its encoded form is stored in <c>titleArray</c>.
        /// For each key of <c>wikidata.categorydict</c> the encoded form is added to
        /// <c>wikidata.categoryTitleArray</c>.
        /// </remarks>
        /// <param name="mem">Memory manager handed to the <c>DecodedTextClass</c> constructor.</param>
        /// <param name="wikidata">Holder of the concept, category and word tables that are read and updated.</param>
        /// <param name="reservedWords">Set that receives the id of every regular word found in a title.</param>
        static void addCatConceptWords(ref MemoryManager mem, ref WikiData wikidata, ref HashSet<int> reservedWords)
        {
            // Character buffer shared by all GetWord calls made through the helper.
            char[] chararray = new char[0];

            // create a new text processor object
            HTMLWikiProcessor hwproc = new HTMLWikiProcessor(new HashSet<int>(), false);
            DecodedTextClass dtc = new DecodedTextClass(mem, true);
            hwproc.LoadDecodedTextClass(ref dtc);

            // The next free word id is one past the largest id already in the dictionary.
            int maxid = 0;
            foreach (KeyValuePair<string, int> kvp in wikidata.worddict)
            {
                if (kvp.Value > maxid)
                {
                    maxid = kvp.Value;
                }
            }
            maxid++;

            // Concept titles (invalid concepts are skipped entirely).
            foreach (KeyValuePair<int, WikiData.conceptstats> kvp in wikidata.conceptdata)
            {
                if (!kvp.Value.valid)
                {
                    continue;
                }
                wikidata.conceptdict.Add(kvp.Value.title, kvp.Key);
                dtc.resetDecoder();
                hwproc.ProcessHTML(kvp.Value.title);
                kvp.Value.titleArray = encodeDecodedTitleWords(dtc, ref chararray, ref wikidata, ref reservedWords, ref maxid);
            }

            // Category names.
            foreach (KeyValuePair<string, int> kvp in wikidata.categorydict)
            {
                dtc.resetDecoder();
                hwproc.ProcessHTML(kvp.Key);
                int[] output = encodeDecodedTitleWords(dtc, ref chararray, ref wikidata, ref reservedWords, ref maxid);
                wikidata.categoryTitleArray.TryAdd(kvp.Key, output);
            }
        }

        /// <summary>
        /// Converts the words currently held by <paramref name="dtc"/> into an id array:
        /// regular words map to their dictionary id (allocated from <paramref name="maxid"/>
        /// when unseen), positive integers map to <c>-1 - value</c>, and stop words or
        /// other integers map to <c>-1</c>.
        /// </summary>
        /// <param name="dtc">Decoder holding the words of the most recently processed text.</param>
        /// <param name="chararray">Reusable character buffer passed through to <c>GetWord</c>.</param>
        /// <param name="wikidata">Holder of the word dictionary and per-word counts; both may be extended.</param>
        /// <param name="reservedWords">Set that receives the id of every regular word.</param>
        /// <param name="maxid">Next free word id; advanced each time a new word is registered.</param>
        /// <returns>One entry per decoded word position.</returns>
        private static int[] encodeDecodedTitleWords(DecodedTextClass dtc, ref char[] chararray, ref WikiData wikidata, ref HashSet<int> reservedWords, ref int maxid)
        {
            // Scratch state filled by DecodedTextClass.GetWord on every call.
            int startindex = 0;
            int length = 0;
            bool sticky = false;
            bool stopword = false;
            int division = 0;
            bool isInt = false;
            int decodedInt = -1;

            int len = dtc.NumberWords();
            int[] output = new int[len];
            for (int i = 0; i < len; i++)
            {
                if (!dtc.GetWord(i, ref chararray, ref startindex, ref length, ref sticky, ref division, ref stopword, ref isInt, ref decodedInt))
                {
                    // Position could not be read: the slot keeps its default value of 0.
                    continue;
                }

                if (stopword || isInt)
                {
                    // Negative codes: -1 for a break in the flow, -1 - n for a positive integer n.
                    output[i] = (isInt && decodedInt > 0) ? -1 - decodedInt : -1;
                    continue;
                }

                string token = new string(chararray, startindex, length);

                int wordid;
                if (!wikidata.worddict.TryGetValue(token, out wordid))
                {
                    // Unknown token: allocate a new id and start its count at 1.
                    wordid = maxid;
                    wikidata.worddict.TryAdd(token, wordid);
                    if (!wikidata.wordidf.TryAdd(wordid, 1))
                    {
                        ++wikidata.wordidf[wordid];
                    }
                    maxid++;
                }

                // HashSet.Add ignores values that are already present.
                reservedWords.Add(wordid);
                output[i] = wordid;
            }

            return output;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Converts the decoded words of one concept into a stream of word ids and stores
        /// it as <c>conceptwords</c> of the entry <c>conceptdata[res.identifier]</c>.
        /// </summary>
        /// <remarks>
        /// Regular words are looked up in (or added to) <c>worddict</c>. New ids are
        /// computed as <c>numthreads * threadcounters[threadid] + threadid</c>, so each
        /// value of <paramref name="threadid"/> draws from its own id sequence.
        /// The first time a word id is seen within this text its entry in <c>wordidf</c>
        /// is incremented (number of concepts the word appears in).
        /// Stop words and integers are written as negative codes: <c>-1 - n</c> for a
        /// positive integer <c>n</c>, otherwise <c>-1</c>.
        /// </remarks>
        /// <param name="res">Decoded text whose words are read; its <c>identifier</c> selects the concept to update.</param>
        /// <param name="threadid">Index of the calling worker; selects the id counter in <c>threadcounters</c>.</param>
        private void processText(DecodedTextClass res, int threadid)
        {
            // Scratch state filled by DecodedTextClass.GetWord on every call.
            char[] chararray = new char[0];
            int startindex = 0;
            int length = 0;
            bool sticky = false;
            bool stopword = false;
            int division = 0;
            bool isInt = false;
            int decodedInt = -1;

            // Word ids already counted towards wordidf for this text.
            HashSet<int> added = new HashSet<int>();
            int len = res.NumberWords();

            // NOTE(review): positions for which GetWord returns false are not written,
            // so the tail of the array may keep default zeros — confirm consumers expect this.
            int[] stream = new int[len];
            int counter = 0;
            for (int i = 0; i < len; i++)
            {
                if (!res.GetWord(i, ref chararray, ref startindex, ref length, ref sticky, ref division, ref stopword, ref isInt, ref decodedInt))
                {
                    continue;
                }

                if (stopword || isInt)
                {
                    // -1 indicates stopword/integer or other break of flow
                    stream[counter] = (isInt && decodedInt > 0) ? -1 - decodedInt : -1;
                    ++counter;
                    continue;
                }

                string token = new string(chararray, startindex, length);

                // Candidate id taken from this thread's own sequence.
                int wordid = numthreads * threadcounters[threadid] + threadid;
                if (worddict.TryAdd(token, wordid))
                {
                    // The candidate was consumed; advance this thread's counter.
                    ++threadcounters[threadid];
                }
                else
                {
                    // Token already known (possibly registered by another thread).
                    wordid = worddict[token];
                }
                stream[counter] = wordid;
                ++counter;

                // Count each word at most once per text.
                if (added.Add(wordid))
                {
                    // Insert-or-increment in a single atomic dictionary operation; the
                    // previous TryAdd followed by ++wordidf[wordid] could lose increments
                    // when several threads hit the same word at once.
                    // Assumes wordidf is a ConcurrentDictionary — TODO confirm.
                    wordidf.AddOrUpdate(wordid, 1, (key, current) => current + 1);
                }
            }

            // should work because id is unique
            conceptdata[res.identifier].conceptwords = stream;
        }