Example #1
0
        /// <summary>
        /// Tokenizes the file named by the first command-line argument and writes three
        /// comma-separated token lists to the working directory: words.txt (all tokens),
        /// title.txt (title tokens) and specificdiv.txt (tokens of the div-filtered pass).
        /// </summary>
        /// <param name="args">Command-line arguments; args[0] is the path of the input file.</param>
        static void Main(string[] args)
        {
            // Read the whole input up front; the using block releases the file handle
            // even if reading fails.
            string body;
            using (StreamReader stream = new StreamReader(args[0]))
            {
                body = stream.ReadToEnd();
            }

            MemoryManager mem = new MemoryManager(4000000, 4000000);
            DecodedTextClass content = new DecodedTextClass(mem, true);
            HTMLWikiProcessor textproc = new HTMLWikiProcessor(new HashSet<int>(), false);
            textproc.LoadDecodedTextClass(ref content);
            content.resetDecoder();
            textproc.ProcessHTML(body);

            string[] tokens = content.GetTokens();
            using (StreamWriter sw = new StreamWriter("words.txt"))
            {
                sw.Write(string.Join(",", tokens));
            }

            // if text is an html page, we can extract the title
            tokens = content.GetTitleTokens();
            using (StreamWriter sw = new StreamWriter("title.txt"))
            {
                sw.Write(string.Join(",", tokens));
            }

            // if text is an html page, we can extract text only from within div's with a matching id
            content.resetDecoder(); // need to reset to reuse the DecodedTextClass object
            HashSet<string> divfilters = new HashSet<string>();
            divfilters.Add("id=\"articleBody\"");
            divfilters.Add("class=\"articleBody\"");

            textproc.ProcessDivHTML(body, divfilters);

            // Fetch the tokens again: the previous array still holds the title tokens, so
            // writing it here would only duplicate title.txt instead of the div-filtered text.
            tokens = content.GetTokens();
            using (StreamWriter sw = new StreamWriter("specificdiv.txt"))
            {
                sw.Write(string.Join(",", tokens));
            }
        }
Example #2
0
        /// <summary>
        /// Tokenizes the file named by the first command-line argument and writes three
        /// comma-separated token lists to the working directory: words.txt (all tokens),
        /// title.txt (title tokens) and specificdiv.txt (tokens of the div-filtered pass).
        /// </summary>
        /// <param name="args">Command-line arguments; args[0] is the path of the input file.</param>
        static void Main(string[] args)
        {
            // The using block closes the input file even when ReadToEnd throws.
            string body;

            using (StreamReader stream = new StreamReader(args[0]))
            {
                body = stream.ReadToEnd();
            }

            MemoryManager     mem      = new MemoryManager(4000000, 4000000);
            DecodedTextClass  content  = new DecodedTextClass(mem, true);
            HTMLWikiProcessor textproc = new HTMLWikiProcessor(new HashSet <int>(), false);

            textproc.LoadDecodedTextClass(ref content);
            content.resetDecoder();
            textproc.ProcessHTML(body);

            string[] tokens = content.GetTokens();

            using (StreamWriter sw = new StreamWriter("words.txt"))
            {
                sw.Write(string.Join(",", tokens));
            }

            // if text is an html page, we can extract the title
            tokens = content.GetTitleTokens();
            using (StreamWriter sw = new StreamWriter("title.txt"))
            {
                sw.Write(string.Join(",", tokens));
            }

            // if text is an html page, we can extract text only from within div's with a matching id
            content.resetDecoder(); // need to reset to reuse the DecodedTextClass object
            HashSet <string> divfilters = new HashSet <string>();

            divfilters.Add("id=\"articleBody\"");
            divfilters.Add("class=\"articleBody\"");

            textproc.ProcessDivHTML(body, divfilters);

            // Re-read the tokens after the div pass; without this the stale title tokens
            // from the previous step would be written to specificdiv.txt.
            tokens = content.GetTokens();
            using (StreamWriter sw = new StreamWriter("specificdiv.txt"))
            {
                sw.Write(string.Join(",", tokens));
            }
        }
Example #3
0
        /// <summary>
        /// Adds the words of every valid concept title and of every category name to the word
        /// dictionary, marks those words as reserved, and stores each title as an array of
        /// integer word codes.
        /// </summary>
        /// <remarks>
        /// Side effects on <paramref name="wikidata"/>: every valid concept is registered in
        /// <c>conceptdict</c> (title to concept id) and gets its <c>titleArray</c> set; every
        /// category name gets an entry in <c>categoryTitleArray</c>; unseen words are appended to
        /// <c>worddict</c> and <c>wordidf</c>.
        /// </remarks>
        /// <param name="mem">Memory manager handed to the temporary <c>DecodedTextClass</c>.</param>
        /// <param name="wikidata">Collected wiki data; its dictionaries are updated in place.</param>
        /// <param name="reservedWords">Receives the id of every word found in a title or category name.</param>
        static void addCatConceptWords(ref MemoryManager mem, ref WikiData wikidata, ref HashSet<int> reservedWords)
        {
            // create a new text processor object
            HTMLWikiProcessor hwproc = new HTMLWikiProcessor(new HashSet<int>(), false);
            DecodedTextClass dtc = new DecodedTextClass(mem, true);
            hwproc.LoadDecodedTextClass(ref dtc);

            // new words are numbered starting one above the largest id currently in use
            int maxid = 0;
            foreach (KeyValuePair<string, int> kvp in wikidata.worddict)
            {
                if (kvp.Value > maxid)
                {
                    maxid = kvp.Value;
                }
            }
            maxid++;

            // concept titles (invalid concepts are skipped entirely)
            foreach (KeyValuePair<int, WikiData.conceptstats> kvp in wikidata.conceptdata)
            {
                if (!kvp.Value.valid)
                {
                    continue;
                }
                wikidata.conceptdict.Add(kvp.Value.title, kvp.Key);
                kvp.Value.titleArray = encodeTitleWords(kvp.Value.title, hwproc, dtc, wikidata, reservedWords, ref maxid);
            }

            // category names
            foreach (KeyValuePair<string, int> kvp in wikidata.categorydict)
            {
                int[] encoded = encodeTitleWords(kvp.Key, hwproc, dtc, wikidata, reservedWords, ref maxid);
                wikidata.categoryTitleArray.TryAdd(kvp.Key, encoded);
            }
        }

        /// <summary>
        /// Runs <paramref name="text"/> through the text processor and converts every resulting
        /// word into an integer code.
        /// </summary>
        /// <remarks>
        /// Codes: the word id for a regular word (unseen words are registered under
        /// <paramref name="maxid"/>, which is then incremented); <c>-1 - n</c> for an integer
        /// token with positive value <c>n</c>; <c>-1</c> for stop words and all other integer tokens.
        /// </remarks>
        /// <param name="text">Title or category name to encode.</param>
        /// <param name="hwproc">Processor that has <paramref name="dtc"/> loaded.</param>
        /// <param name="dtc">Decoder that is reset and then filled by this call.</param>
        /// <param name="wikidata">Owner of the <c>worddict</c> and <c>wordidf</c> dictionaries that are updated.</param>
        /// <param name="reservedWords">Receives the id of every regular word encountered.</param>
        /// <param name="maxid">Next free word id; advanced for every newly registered word.</param>
        /// <returns>One code per word reported by <c>dtc.NumberWords()</c>.</returns>
        static int[] encodeTitleWords(string text, HTMLWikiProcessor hwproc, DecodedTextClass dtc, WikiData wikidata, HashSet<int> reservedWords, ref int maxid)
        {
            // scratch variables handed to GetWord by reference
            // NOTE(review): assumes GetWord only writes to these and does not read the previous values - confirm
            char[] chararray = new char[0];
            int startindex = 0;
            int length = 0;
            bool sticky = false;
            bool stopword = false;
            int division = 0;
            bool isInt = false;
            int decodedInt = -1;

            dtc.resetDecoder();
            hwproc.ProcessHTML(text);

            // loop through the resulting words
            int len = dtc.NumberWords();
            int[] output = new int[len];
            for (int i = 0; i < len; i++)
            {
                if (!dtc.GetWord(i, ref chararray, ref startindex, ref length, ref sticky, ref division, ref stopword, ref isInt, ref decodedInt))
                {
                    // NOTE(review): the slot keeps its default 0, which is indistinguishable
                    // from word id 0 - confirm this is intended
                    continue;
                }

                if (stopword || isInt)
                {
                    output[i] = (isInt && decodedInt > 0) ? -1 - decodedInt : -1;
                    continue;
                }

                string token = new string(chararray, startindex, length);
                int wordid;
                if (!wikidata.worddict.TryGetValue(token, out wordid))
                {
                    // unseen word: register it under the next free id
                    wordid = maxid;
                    wikidata.worddict.TryAdd(token, wordid);
                    // start its wordidf counter at 1, or bump the counter if an entry already exists
                    if (!wikidata.wordidf.TryAdd(wordid, 1))
                    {
                        ++wikidata.wordidf[wordid];
                    }
                    maxid++;
                }

                // HashSet.Add is a no-op for ids that are already present
                reservedWords.Add(wordid);
                output[i] = wordid;
            }
            return output;
        }
Example #4
0
        static void Main(string[] args)
        {
            //read parameters from commandline
            Arguments CommandLine = new Arguments(args);
            string wikipath;
            int numthreads;
            wikistream wikimediastream = new wikistream();

            if (CommandLine["help"] != null)
            {
                help();
                return;
            }

            if (CommandLine["wiki"] != null)
            {
                Console.WriteLine("Using wikimedia file: " + CommandLine["wiki"]);
                wikipath = CommandLine["wiki"];
            }
            else
            {
                Console.WriteLine("No wikimedia file provided!");
                return;
            }
            FileStream fileStreamIn;
            try
            {
                fileStreamIn = new FileStream(wikipath, FileMode.Open, FileAccess.Read);
            }
            catch
            {
                Console.WriteLine("Cannot access wikimedia file!");
                return;
            }
            //determine type of stream
            string[] els = wikipath.Split('.');
            switch (els[els.Length - 1])
            {
                case "xml":
                    wikimediastream.xmlstream = fileStreamIn;
                    wikimediastream.type = "xml";
                    break;
                case "gz":
                    try
                    {
                        wikimediastream.gzipstream = new GZipStream(fileStreamIn,CompressionMode.Decompress);
                        wikimediastream.type = "gz";
                    }
                    catch
                    {
                        Console.WriteLine("This gzipped wikimedia archive is invalid.");
                        return;
                    }
                    break;
                case "bz2":
                    try
                    {
                        wikimediastream.bzip2stream = new BZip2InputStream(fileStreamIn);
                        wikimediastream.type = "bz2";
                    }
                    catch
                    {
                        Console.WriteLine("This bzip2 wikimedia archive is invalid.");
                        return;
                    }
                    break;
                default:
                    Console.WriteLine("This wikimedia file seems to be neither an XML file nor a valid gzip or bzip2 archive.");
                    return;
            }
            numthreads = 2;
            if (CommandLine["threads"] != null)
            {
                try
                {
                    Console.WriteLine("Number of worker threads: " + CommandLine["threads"]);
                    numthreads = Convert.ToInt32(CommandLine["threads"]);
                }
                catch
                {
                    Console.WriteLine("Invalid number of worker threads (has to lie between 1 and 128)");
                    return;
                }
                if ((numthreads < 1) || (numthreads > 128))
                {
                    Console.WriteLine("Invalid number of worker threads (has to lie between 1 and 128)");
                    return;
                }
            }
            else
            {
                Console.WriteLine("Number of worker threads: 2 (default)");
            }
            int wordthreshold = 3;
            int bigramthreshold = 3;
            int minconceptlength = 100;
            int minconstructs = 5;
            if (CommandLine["wordthreshold"] != null)
            {
                if (!Int32.TryParse(CommandLine["wordthreshold"], out wordthreshold) || wordthreshold <=0)
                {
                    Console.WriteLine("Invalid wordthreshold (has to be positive integer)");
                    return;
                }
            }
            if (CommandLine["bigramthreshold"] != null)
            {
                if (!Int32.TryParse(CommandLine["bigramthreshold"], out bigramthreshold) || bigramthreshold <= 0)
                {
                    Console.WriteLine("Invalid bigramthreshold (has to be positive integer)");
                    return;
                }
            }
            if (CommandLine["minconceptlength"] != null)
            {
                if (!Int32.TryParse(CommandLine["minconceptlength"], out minconceptlength) || minconceptlength <= 0)
                {
                    Console.WriteLine("Invalid minceptlength (has to be positive integer)");
                    return;
                }
            }
            if (CommandLine["minconstructs"] != null)
            {
                if (!Int32.TryParse(CommandLine["minconstructs"], out minconstructs) || minconstructs <= 0)
                {
                    Console.WriteLine("Invalid minconstructs (has to be positive integer)");
                    return;
                }
            }
            bool debug = false;
            if (CommandLine["debug"] != null)
            {
                debug = true;
            }

            //now declare memory class and a number of decoder classes (plus queue for filled decoders)
            MemoryManager mem = new MemoryManager(4000000, 4000000);

            // objects associated with two numbering queues
            int processed_docs = 0;
            Object processedlock = new Object();
            Object debuglock = new Object();

            int activethreads = numthreads;

            //reader thread
            //this thread reads byte[] into pages queue one page at a time
            ConcurrentQueue<byte[]> pages = new ConcurrentQueue<byte[]>();
            bool reader_done = false;
            new Thread(delegate()
            {
                wikireader.read(wikimediastream, ref pages, ref reader_done);
            }).Start();

            //two numbering threads
            // misc processing thread
            WikiData wikidata = new WikiData(numthreads);

            //now start worker threads
            object lockthread = new object();
            int badpages = 0;

            Object threadlock = new Object();
            int threadcounter = 0;

            for (int t = 0; t < numthreads; t++)
            {
                new Thread(delegate()
                {
                    //this is the worker thread
                    DecodedTextClass element = new DecodedTextClass(mem, true);
                    HTMLWikiProcessor textproc = new HTMLWikiProcessor(new HashSet<int>(), false);
                    textproc.LoadDecodedTextClass(ref element);

                    int threadid;
                    lock (threadlock)
                    {
                        threadid = threadcounter;
                        ++threadcounter;
                    }
                    string lasttitle = "";
                    string lastbody = "";
                    string lastid = "";
                    while (1 == 1)
                    {
                        int status = 0;
                        while (1 == 1)
                        {
                            if (reader_done && status == 0)
                            {
                                status = 1;
                            }
                            //get new page
                            byte[] singlepage;
                            if (pages.TryDequeue(out singlepage))
                            {
                                string body = "";
                                string identifier = "";
                                string title = "";
                                bool redirect = false;
                                string redirecttitle = "";
                                //we only process off the queue one page at a time, so we don't have to worry about storing
                                //multiple titles, etc... -- only one of each per string at a time
                                //description of refs given in pageextractor.cs
                                PageExtractor.GetPage(singlepage, ref body, ref identifier, ref title, ref redirect, ref redirecttitle);
                                /*
                                lock (debuglock)
                                {
                                    raw.WriteLine(identifier + "," + title);
                                }
                                */
                                if (body != "" && identifier != "" && title != "")
                                {
                                    //element is the decodedtextclass object
                                    element.identifier = Convert.ToInt32(identifier);
                                    element.title = title;
                                    element.redirect = redirect;
                                    element.redirecttitle = redirecttitle;
                                    try
                                    {
                                        //releases memory
                                        element.resetDecoder();
                                        //textproc is the HTMLWikiProcessor object (has element attached)
                                        //element is ref in htmlwikiprocessor, so this modifies it
                                        textproc.ProcessHTML(body);
                                    }
                                    catch (Exception e)
                                    {
                                        lock (debuglock)
                                        {
                                            StreamWriter error = new StreamWriter("error.txt", true);
                                            error.WriteLine("ERROR PROCESSING PAGE");
                                            error.WriteLine("-------------");
                                            error.WriteLine(e.Message);
                                            error.WriteLine(e.StackTrace);
                                            error.WriteLine("-------------");
                                            error.Write(body);
                                            error.Close();
                                        }
                                        continue;
                                    }

                                    // determine the type of page by the title text
                                    if (title.EndsWith("(disambiguation)"))
                                    {
                                        element.isdisambig = true;
                                        element.title = title.Replace("(disambiguation)", " ").Trim();
                                    }
                                    else
                                    {
                                        element.isdisambig = false;
                                    }
                                    if (title.StartsWith("Category:"))
                                    {
                                        element.iscategory = true;
                                        element.title = title.Replace("Category:", " ").Trim();
                                    }
                                    else
                                    {
                                        element.iscategory = false;
                                    }
                                    // process the current element
                                    //element is the decodedtextclass object
                                    wikidata.process(ref element, threadid, minconstructs, minconceptlength);
                                    lock (processedlock)
                                    {
                                        ++processed_docs;
                                    }
                                }
                                else
                                {
                                    badpages++;
                                    lock (debuglock)
                                    {
                                        StreamWriter sw = new StreamWriter("badpages.txt", true);
                                        sw.WriteLine("ERROR ON PAGE");
                                        sw.WriteLine("-------------");
                                        sw.Write(Encoding.UTF8.GetString(singlepage));
                                        sw.WriteLine();
                                        sw.Close();
                                    }
                                }
                                lasttitle = title;
                                lastbody = body;
                                lastid = identifier;
                                break;
                            }
                            else
                            {
                                if (status == 1)
                                {
                                    status = 2;
                                }
                                Thread.Sleep(10);
                            }
                            if (status == 2)
                            {
                                break;
                            }
                        }
                        if (status == 2)
                        {
                            lock (lockthread)
                            {
                                activethreads--;
                                /*
                                StreamWriter sw = new StreamWriter("lastpage" + threadid + ".txt");
                                sw.WriteLine(lasttitle);
                                sw.WriteLine(lastbody);
                                sw.WriteLine(lastid);
                                sw.Close();
                                 */
                            }
                            break;
                        }
                    }
                }).Start();
            }

            //main thread is waiting for other threads to finish
            DateTime startime = DateTime.Now;
            while (1 == 1)
            {
                MemoryManager.memorystats stats = mem.GetMemStats();
                double avg = processed_docs / (DateTime.Now - startime).TotalSeconds;
                Console.Write("P: {0}, P/s: {1:#.##}, PQ:{2}, M(c/i): {3}/{4} %, R/P: {5}/{6}, of {7}, at: {8} \r", processed_docs, avg, pages.Count, Math.Round(100 * stats.usecharmem), Math.Round(100 * stats.usedshortmem), stats.reserveincidents, mem.priorityqueue, stats.overflow, activethreads);
                //Console.Write("P:{0},P/sec:{1:#.##},b:{2},EQ:{3},Q1:{4},Q2:{5},PQ:{6} \r",
                //   processed_docs, avg, badpages, emptycontent_queue.Count, output1.Count, output2.Count, pages.Count);
                //Console.Write("P:{0},P/sec:{1:#.##},words:{2},widf:{3},concepts:{4}             \r",
                //    processed_docs, avg, tp.worddict.Count, tp.wordidf.Count, tp.conceptwords.Count);

                if (activethreads == 0)
                {
                    //raw.Close();
                    Console.WriteLine("P: {0}, P/s: {1:#.##}, PQ:{2}, M(c/i): {3}/{4} %, R/P: {5}/{6}, of {7}, at: {8}", processed_docs, avg, pages.Count, Math.Round(100 * stats.usecharmem), Math.Round(100 * stats.usedshortmem), stats.reserveincidents, mem.priorityqueue, stats.overflow, activethreads);
                    break;
                }
                Thread.Sleep(1000);
            }

            // proceed to final step
            Console.WriteLine("Finishing up ...");

            // read in stop words
            string dir = System.IO.Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location);
            StreamReader sr = new StreamReader(dir+"/"+"stopwords.csv");
            string stopwordstr = sr.ReadToEnd();
            sr.Close();
            HashSet<int> reservedWords = new HashSet<int>();
            HashSet<int> rawstopwords = convertStopWords(stopwordstr, ref mem, ref wikidata, ref reservedWords);

            //now add concept title and categories to word dictionary
            addCatConceptWords(ref mem, ref wikidata, ref reservedWords);
            //now remove empty string
            foreach (KeyValuePair<string, int> kvp in wikidata.worddict)
            {
                if (kvp.Key.Trim() == "")
                {
                    int wordid;
                    wikidata.worddict.TryRemove(kvp.Key, out wordid);
                    if (reservedWords.Contains(wordid))
                    {
                        reservedWords.Remove(wordid);
                    }
                }
            }

            //create output files
            #region cleanwordsconcepts
            //delete words below the threshold
            HashSet<int> validwords = new HashSet<int>();
            foreach (KeyValuePair<string, int> kvp in wikidata.worddict)
            {
                if (wikidata.wordidf[kvp.Value] >= wordthreshold || reservedWords.Contains(kvp.Value))
                {
                    validwords.Add(kvp.Value);
                }
            }
            //get list of valid concepts
            HashSet<int> validconcepts = new HashSet<int>();
            foreach (KeyValuePair<int, WikiData.conceptstats> kvp in wikidata.conceptdata)
            {
                if (kvp.Value.valid)
                {
                    validconcepts.Add(kvp.Key);
                }
            }
            //get list of categories
            HashSet<int> validcats = new HashSet<int>();
            foreach (KeyValuePair<string, int> kvp in wikidata.categorydict)
            {
                validcats.Add(kvp.Value);
            }

            //now create crosswalks
            int[] dummy = validwords.ToArray<int>();
            Dictionary<int, int> crosswalk_words = new Dictionary<int, int>();
            for (int i = 0; i < dummy.Length; i++)
            {
                crosswalk_words.Add(dummy[i], i);
            }
            dummy = validconcepts.ToArray<int>();
            Dictionary<int, int> crosswalk_concepts = new Dictionary<int, int>();
            for (int i = 0; i < dummy.Length; i++)
            {
                crosswalk_concepts.Add(dummy[i], i);
            }
            dummy = validcats.ToArray<int>();
            Dictionary<int, int> crosswalk_cats = new Dictionary<int, int>();
            for (int i = 0; i < dummy.Length; i++)
            {
                crosswalk_cats.Add(dummy[i], i);
            }
            #endregion cleanwordsconcepts

            Console.WriteLine("Calculating frequency resources ...");
            createFreqResources(numthreads, wikidata, validwords, validconcepts, crosswalk_words, crosswalk_concepts,rawstopwords,bigramthreshold);

            DateTimeOffset start = DateTimeOffset.Now;
            #region concept_outputfiles
            Console.WriteLine("Writing concept output files ...");
            //write concepts
            string filename = "concepts.txt";
            File.Delete(filename);
            StreamWriter writer = new StreamWriter(filename);
            foreach(int id in validconcepts)
            {
                writer.WriteLine(wikidata.conceptdata[id].title+"\t"+crosswalk_concepts[id]+"\t"+wikidata.conceptdata[id].conceptwords.Length);
            }
            writer.Close();
            //write concept titles
            filename = "concept_titles.txt";
            File.Delete(filename);
            writer = new StreamWriter(filename);
            foreach (int id in validconcepts)
            {
                //Console.WriteLine("{0}  {1}", id, crosswalk_concepts[id]);
                writer.Write(crosswalk_concepts[id]);
                //Console.WriteLine(wikidata.conceptdata[id].titleArray.Length);
                for (int i = 0; i < wikidata.conceptdata[id].titleArray.Length;i++)
                {
                    if (wikidata.conceptdata[id].titleArray[i] < 0)
                    {
                        writer.Write("\t" + wikidata.conceptdata[id].titleArray[i]);
                    }
                    else
                    {
                        writer.Write("\t" + crosswalk_words[wikidata.conceptdata[id].titleArray[i]]);
                    }
                }
                writer.WriteLine();
            }
            writer.Close();
            //write concept words
            filename = "concept_word.txt";
            File.Delete(filename);
            writer = new StreamWriter(filename);
            foreach (int id in validconcepts)
            {
                writer.Write(crosswalk_concepts[id]);
                for (int i = 0; i < wikidata.conceptdata[id].conceptwords.Length; i++)
                {
                    int k=wikidata.conceptdata[id].conceptwords[i];
                    if (k>=0)
                    {
                        if (validwords.Contains(k))
                        {
                            k = crosswalk_words[k];
                        }
                        else
                        {
                            k = -1;
                        }
                    }
                    writer.Write("\t" + k);
                }
                writer.WriteLine();
            }
            writer.Close();
            #endregion concept_outputfiles
            Console.WriteLine("Concept files saved in {0:F2} minutes ...",(DateTimeOffset.Now - start).TotalMinutes);

            start = DateTimeOffset.Now;
            #region categories_outputfiles
            Console.WriteLine("Writing category output files ...");
            //write categories
            filename = "categories.txt";
            File.Delete(filename);
            writer = new StreamWriter(filename);
            Dictionary<int, string> catlist = new Dictionary<int, string>();
            //Console.WriteLine(wikidata.categorydict.Count);
            foreach (KeyValuePair<string, int> kvp in wikidata.categorydict)
            {
                catlist.Add(kvp.Value, kvp.Key);
            }
            foreach (int id in validcats)
            {
                writer.WriteLine(catlist[id]+"\t"+crosswalk_cats[id]);
            }
            writer.Close();
            //write category words
            filename = "categories_titles.txt";
            File.Delete(filename);
            writer = new StreamWriter(filename);
            foreach (int id in validcats)
            {
                writer.Write(crosswalk_cats[id]);
                for (int i = 0; i < wikidata.categoryTitleArray[catlist[id]].Length; i++)
                {
                    writer.Write("\t" + wikidata.categoryTitleArray[catlist[id]][i]);
                }
                writer.WriteLine();
            }
            writer.Close();
            //write category parent matrix
            filename = "categories_parentmatrix.txt";
            File.Delete(filename);
            writer = new StreamWriter(filename);
            foreach (KeyValuePair<string, HashSet<string>> kvp in wikidata.parentcategorydict)
            {
                if (wikidata.categorydict.ContainsKey(kvp.Key))
                {
                    writer.Write(crosswalk_cats[wikidata.categorydict[kvp.Key]]);
                }
                foreach (string cat in kvp.Value)
                {
                    if (wikidata.categorydict.ContainsKey(cat))
                    {
                        writer.Write("\t"+crosswalk_cats[wikidata.categorydict[cat]]);
                    }
                }
                writer.WriteLine();
            }
            writer.Close();
            //write concept_categories
            filename = "concept_categories.txt";
            File.Delete(filename);
            writer = new StreamWriter(filename);
            foreach (int id in validconcepts)
            {
                writer.Write(crosswalk_concepts[id]);
                foreach (string cat in wikidata.conceptdata[id].categories)
                {
                    if (wikidata.categorydict.ContainsKey(cat))
                    {
                        writer.Write("\t" + crosswalk_cats[wikidata.categorydict[cat]]);
                    }
                }
                writer.WriteLine();
            }
            writer.Close();
            #endregion categories_outputfiles
            Console.WriteLine("Category files saved in {0:F2} minutes ...", (DateTimeOffset.Now - start).TotalMinutes);

            start = DateTimeOffset.Now;
            #region links_outputfiles
            Console.WriteLine("Writing links output files ...");
            //write disambiguation matrix
            filename = "disambiguation.txt";
            File.Delete(filename);
            writer = new StreamWriter(filename);
            foreach (KeyValuePair<string, HashSet<string>> kvp in wikidata.disambigredirect)
            {
                writer.Write(kvp.Key);
                foreach (string child in kvp.Value)
                {
                    //Console.WriteLine(child);
                    if (wikidata.conceptdict.ContainsKey(child))
                    {
                        int id2 = wikidata.conceptdict[child];
                        if (validconcepts.Contains(id2))
                        {
                            writer.Write("\t" + crosswalk_concepts[id2]);
                        }
                    }
                }
                writer.WriteLine();
            }
            writer.Close();
            //write redirects
            filename = "redirects.txt";
            File.Delete(filename);
            writer = new StreamWriter(filename);
            foreach (KeyValuePair<string,string> kvp in wikidata.conceptredirects)
            {
                if (kvp.Value == null)
                {
                    continue;
                }
                writer.Write(kvp.Key);
                if (wikidata.conceptdict.ContainsKey(kvp.Value))
                {
                    int id2 = wikidata.conceptdict[kvp.Value];
                    if (validconcepts.Contains(id2))
                    {
                        writer.Write("\t" + crosswalk_concepts[id2]);
                    }
                }
                writer.WriteLine();
            }
            writer.Close();
            //write concept_outlinks
            filename = "concept_outlinks.txt";
            File.Delete(filename);
            writer = new StreamWriter(filename);
            foreach (int id in validconcepts)
            {
                writer.Write(crosswalk_concepts[id]);
                foreach (string link in wikidata.conceptdata[id].outlinks)
                {
                    if (wikidata.conceptdict.ContainsKey(link))
                    {
                        int id2 = wikidata.conceptdict[link];
                        if (validconcepts.Contains(id2))
                        {
                            writer.Write("\t"+crosswalk_concepts[id2]);
                        }
                    }
                }
                writer.WriteLine();
            }
            writer.Close();
            #endregion links_outputfiles
            Console.WriteLine("Links files saved in {0:F2} minutes ...", (DateTimeOffset.Now - start).TotalMinutes);
        }
Example #5
0
        /// <summary>
        /// Tokenizes <paramref name="rawstopwords"/> with an <c>HTMLWikiProcessor</c> and maps every
        /// non-integer token to its word id in <c>wikidata.worddict</c>, registering tokens that are
        /// not yet known under fresh ids.
        /// </summary>
        /// <param name="rawstopwords">Raw text containing the stop words; it is passed to <c>ProcessHTML</c>.</param>
        /// <param name="mem">Memory manager used to construct the temporary <c>DecodedTextClass</c>.</param>
        /// <param name="wikidata">Word dictionary (<c>worddict</c>) and per-word counts (<c>wordidf</c>); both may be extended.</param>
        /// <param name="reservedWords">Set that receives the id of every stop word found.</param>
        /// <returns>The set of word ids corresponding to the stop words found in the text.</returns>
        static HashSet<int> convertStopWords(string rawstopwords, ref MemoryManager mem, ref WikiData wikidata, ref HashSet<int> reservedWords)
        {
            // Output slots filled by DecodedTextClass.GetWord on every call.
            char[] chararray = new char[0];
            int startindex = 0;
            int length = 0;
            bool sticky = false;
            bool stopword = false;
            int division = 0;
            bool isInt = false;
            int decodedInt = -1;

            // create a new text processor object and tokenize the raw stop word text
            HTMLWikiProcessor hwproc = new HTMLWikiProcessor(new HashSet<int>(), false);
            DecodedTextClass dtc = new DecodedTextClass(mem, true);
            hwproc.LoadDecodedTextClass(ref dtc);
            hwproc.ProcessHTML(rawstopwords);

            // ids of all stop words encountered
            HashSet<int> rawids = new HashSet<int>();

            // New words get ids starting one past the largest id currently in the dictionary.
            int maxid = 0;
            foreach (KeyValuePair<string, int> kvp in wikidata.worddict)
            {
                if (kvp.Value > maxid)
                {
                    maxid = kvp.Value;
                }
            }
            maxid++;

            // loop through the resulting words
            int len = dtc.NumberWords();
            for (int i = 0; i < len; i++)
            {
                if (!dtc.GetWord(i, ref chararray, ref startindex, ref length, ref sticky, ref division, ref stopword, ref isInt, ref decodedInt))
                {
                    continue;
                }
                if (isInt)
                {
                    // tokens that GetWord reports as integers are not treated as stop words
                    continue;
                }

                string token = new string(chararray, startindex, length);
                if (!wikidata.worddict.ContainsKey(token))
                {
                    wikidata.worddict.TryAdd(token, maxid);
                    // Initialise the count for the new id exactly once; performing this step twice
                    // would leave every newly added word with a count of 2 instead of 1.
                    if (!wikidata.wordidf.TryAdd(maxid, 1))
                    {
                        ++wikidata.wordidf[maxid];
                    }
                    maxid++;
                }

                // HashSet.Add already ignores duplicates, so no Contains pre-check is needed.
                int wordid = wikidata.worddict[token];
                reservedWords.Add(wordid);
                rawids.Add(wordid);
            }
            return rawids;
        }