/// <summary>
/// Decodes a raw (HTML/wiki-encoded) stop-word string into word ids,
/// registering any previously unseen token in <paramref name="wikidata"/>'s
/// word dictionary, and marks every resulting id as reserved.
/// </summary>
/// <param name="rawstopwords">Raw stop-word text to run through the HTML/wiki decoder.</param>
/// <param name="mem">Memory manager backing the decoder buffers.</param>
/// <param name="wikidata">Global word dictionary / word-IDF tables to read and update.</param>
/// <param name="reservedWords">Set of word ids that must never be pruned; all stop-word ids are added.</param>
/// <returns>The set of word ids corresponding to the decoded stop words.</returns>
static HashSet<int> convertStopWords(string rawstopwords, ref MemoryManager mem, ref WikiData wikidata, ref HashSet<int> reservedWords)
{
    char[] chararray = new char[0];
    int startindex = 0;
    int length = 0;
    bool sticky = false;
    bool stopword = false;
    int division = 0;
    bool isInt = false;
    int decodedInt = -1;

    // create a new text processor object
    HTMLWikiProcessor hwproc = new HTMLWikiProcessor(new HashSet<int>(), false);
    DecodedTextClass dtc = new DecodedTextClass(mem, true);
    hwproc.LoadDecodedTextClass(ref dtc);
    hwproc.ProcessHTML(rawstopwords);

    // perform cleanup on the data
    HashSet<int> rawids = new HashSet<int>();

    // Next free word id = (largest id currently in use) + 1.
    int len = dtc.NumberWords();
    int maxid = 0;
    foreach (KeyValuePair<string, int> kvp in wikidata.worddict)
    {
        if (kvp.Value > maxid)
        {
            maxid = kvp.Value;
        }
    }
    maxid++;

    // loop through the resulting words
    for (int i = 0; i < len; i++)
    {
        if (!dtc.GetWord(i, ref chararray, ref startindex, ref length, ref sticky, ref division, ref stopword, ref isInt, ref decodedInt))
        {
            continue;
        }
        if (isInt)
        {
            continue; // decoded integers are never treated as stop words
        }

        string token = new string(chararray, startindex, length);
        int wordid;
        if (!wikidata.worddict.TryGetValue(token, out wordid))
        {
            // Unseen token: register it under a fresh id.
            wikidata.worddict.TryAdd(token, maxid);
            // BUGFIX: this IDF bump used to run twice for new tokens (once
            // before and once after the worddict TryAdd), starting fresh
            // stop words at an IDF of 2 instead of 1 — inconsistent with
            // addCatConceptWords, which bumps exactly once.
            if (!wikidata.wordidf.TryAdd(maxid, 1))
            {
                ++wikidata.wordidf[maxid];
            }
            maxid++;
            wordid = wikidata.worddict[token];
        }

        // HashSet<T>.Add is a no-op on duplicates, so no Contains pre-check is needed.
        reservedWords.Add(wordid);
        rawids.Add(wordid);
    }
    return rawids;
}
/// <summary>
/// Tokenizes every valid concept title and every category name into arrays of
/// word ids (stored on the concept / in categoryTitleArray), registering any
/// unseen token in wikidata's word dictionary and reserving all title words.
/// Stop words and integers are encoded as negative sentinels: -1-n for a
/// decoded positive integer n, plain -1 otherwise.
/// </summary>
/// <param name="mem">Memory manager backing the decoder buffers.</param>
/// <param name="wikidata">Global word/concept/category tables to read and update.</param>
/// <param name="reservedWords">Set of word ids that must never be pruned; all title-word ids are added.</param>
static void addCatConceptWords(ref MemoryManager mem, ref WikiData wikidata, ref HashSet<int> reservedWords)
{
    char[] chararray = new char[0];
    int startindex = 0;
    int length = 0;
    bool sticky = false;
    bool stopword = false;
    int division = 0;
    bool isInt = false;
    int decodedInt = -1;

    // create a new text processor object
    HTMLWikiProcessor hwproc = new HTMLWikiProcessor(new HashSet<int>(), false);
    DecodedTextClass dtc = new DecodedTextClass(mem, true);
    hwproc.LoadDecodedTextClass(ref dtc);

    // Next free word id = (largest id currently in use) + 1.
    int maxid = 0;
    foreach (KeyValuePair<string, int> kvp in wikidata.worddict)
    {
        if (kvp.Value > maxid)
        {
            maxid = kvp.Value;
        }
    }
    maxid++;

    // Pass 1: concept titles — builds conceptdict (title -> concept id) and
    // each concept's titleArray of word ids.
    foreach (KeyValuePair<int, WikiData.conceptstats> kvp in wikidata.conceptdata)
    {
        if (!kvp.Value.valid)
        {
            continue;
        }
        // NOTE(review): plain Add throws on a duplicate title — assumes valid
        // concept titles are unique; confirm upstream guarantees this.
        wikidata.conceptdict.Add(kvp.Value.title,kvp.Key);
        // Decoder state is reused across iterations, so reset before each title.
        dtc.resetDecoder();
        hwproc.ProcessHTML(kvp.Value.title);
        // loop through the resulting words
        int len = dtc.NumberWords();
        int[] output = new int[len];
        for (int i = 0; i < len; i++)
        {
            if (dtc.GetWord(i, ref chararray, ref startindex, ref length, ref sticky, ref division, ref stopword, ref isInt, ref decodedInt))
            {
                if (!stopword && !isInt)
                {
                    string token = (new string(chararray, startindex, length));
                    if (!wikidata.worddict.ContainsKey(token))
                    {
                        // New token: assign a fresh id and start its IDF at 1.
                        wikidata.worddict.TryAdd(token, maxid);
                        if (!wikidata.wordidf.TryAdd(maxid, 1))
                        {
                            ++wikidata.wordidf[maxid];
                        }
                        maxid++;
                    }
                    // Title words are protected from pruning.
                    if (!reservedWords.Contains(wikidata.worddict[token]))
                    {
                        reservedWords.Add(wikidata.worddict[token]);
                    }
                    output[i] = wikidata.worddict[token];
                }
                else
                {
                    // Negative sentinel: -1-n for a positive decoded integer n, else -1.
                    if (isInt && decodedInt>0)
                    {
                        output[i]=-1-decodedInt;
                    }
                    else
                    {
                        output[i] = -1;
                    }
                }
            }
        }
        kvp.Value.titleArray = output;
    }

    // Pass 2: category names — same tokenization, stored in categoryTitleArray.
    foreach (KeyValuePair<string, int> kvp in wikidata.categorydict)
    {
        dtc.resetDecoder();
        hwproc.ProcessHTML(kvp.Key);
        // loop through the resulting words
        int len = dtc.NumberWords();
        int[] output = new int[len];
        for (int i = 0; i < len; i++)
        {
            if (dtc.GetWord(i, ref chararray, ref startindex, ref length, ref sticky, ref division, ref stopword, ref isInt, ref
decodedInt))
            {
                if (!stopword && !isInt)
                {
                    string token = (new string(chararray, startindex, length));
                    if (!wikidata.worddict.ContainsKey(token))
                    {
                        // New token: assign a fresh id and start its IDF at 1.
                        wikidata.worddict.TryAdd(token, maxid);
                        if (!wikidata.wordidf.TryAdd(maxid, 1))
                        {
                            ++wikidata.wordidf[maxid];
                        }
                        maxid++;
                    }
                    if (!reservedWords.Contains(wikidata.worddict[token]))
                    {
                        reservedWords.Add(wikidata.worddict[token]);
                    }
                    output[i] = wikidata.worddict[token];
                }
                else
                {
                    // Negative sentinel: -1-n for a positive decoded integer n, else -1.
                    if (isInt && decodedInt > 0)
                    {
                        output[i] = -1 - decodedInt;
                    }
                    else
                    {
                        output[i] = -1;
                    }
                }
            }
        }
        wikidata.categoryTitleArray.TryAdd(kvp.Key,output);
    }
}
/// <summary>
/// Converts the decoded words of one concept into a word-id stream and stores
/// it on the matching concept record. Unseen tokens get a thread-unique id
/// (striped modulo numthreads), and each word's document frequency is bumped
/// at most once per concept. Stop words and integers become negative
/// sentinels: -1-n for a decoded positive integer n, plain -1 otherwise.
/// </summary>
/// <param name="res">Decoded text for one concept; its identifier selects the target concept record.</param>
/// <param name="threadid">Index of the worker thread, used for id striping and its counter slot.</param>
private void processText(DecodedTextClass res, int threadid)
{
    char[] buf = new char[0];
    int wordStart = 0, wordLen = 0, div = 0, numValue = -1;
    bool isSticky = false, isStop = false, isNumber = false;

    int total = res.NumberWords();
    int[] tokens = new int[total];
    int writePos = 0;
    // Word ids whose IDF has already been counted for this concept.
    HashSet<int> seenThisConcept = new HashSet<int>();

    for (int w = 0; w < total; w++)
    {
        if (!res.GetWord(w, ref buf, ref wordStart, ref wordLen, ref isSticky, ref div, ref isStop, ref isNumber, ref numValue))
        {
            continue;
        }

        if (isStop || isNumber)
        {
            // Negative sentinel marks a break of flow: positive integers are
            // encoded as -1-n, everything else as -1.
            tokens[writePos] = (isNumber && numValue > 0) ? -1 - numValue : -1;
            ++writePos;
            continue;
        }

        string word = new string(buf, wordStart, wordLen);
        // Candidate id is unique to this thread: ids are striped modulo numthreads.
        int id = numthreads * threadcounters[threadid] + threadid;
        if (worddict.TryAdd(word, id))
        {
            // Fresh token consumed the candidate id — advance this thread's counter.
            ++threadcounters[threadid];
        }
        else
        {
            // Token already known: reuse its existing id.
            id = worddict[word];
        }
        tokens[writePos] = id;
        ++writePos;

        // Bump IDF (number of concepts the word appears in) only on the first
        // occurrence of this id within the current concept.
        if (seenThisConcept.Add(id))
        {
            if (!wordidf.TryAdd(id, 1))
            {
                ++wordidf[id];
            }
        }
    }

    // Safe without coordination because the concept identifier is unique.
    conceptdata[res.identifier].conceptwords = tokens;
}