Пример #1
0
        /// <summary>
        /// Records a regular article in <c>conceptdata</c>: its trimmed title, its outgoing
        /// links, its categories and its redirects, and then passes the article on to
        /// <c>processText</c>. The entry is flagged invalid and processing stops early when
        /// the article has fewer than <paramref name="links_threshold"/> wiki constructs or
        /// fewer than <paramref name="minimum_length"/> good words.
        /// </summary>
        /// <param name="res">Decoded article to process.</param>
        /// <param name="threadid">Forwarded unchanged to <c>processText</c>.</param>
        /// <param name="links_threshold">Minimum number of wiki constructs required.</param>
        /// <param name="minimum_length">Minimum <c>NumberGoodWords()</c> value required.</param>
        private void processregular(DecodedTextClass res, int  threadid, int links_threshold = 5, int minimum_length=100)
        {
            string trimmedTitle = res.title.Trim();
            int id = res.identifier;

            // Make sure an entry exists for this id (presumably a no-op when one is
            // already present), then mark it valid until one of the checks below fails.
            conceptdata.TryAdd(id, new conceptstats());
            conceptdata[id].valid = true;
            conceptdata[id].title = trimmedTitle;

            int constructCount = res.NumberWikiConstructs();

            // Short-circuit evaluation: the word count is only queried when the
            // construct count already meets the threshold.
            if (constructCount < links_threshold || res.NumberGoodWords() < minimum_length)
            {
                conceptdata[id].valid = false;
                return;
            }

            HashSet<string> categorySet = new HashSet<string>();
            HashSet<string> redirectSet = new HashSet<string>();

            // Output slots filled by GetWikiConstruct on each successful call.
            char[] buffer = new char[0];
            int offset = 0;
            int count = 0;
            int kind = 0;

            for (int index = 0; index < constructCount; index++)
            {
                if (!res.GetWikiConstruct(index, ref kind, ref buffer, ref offset, ref count))
                {
                    continue;
                }

                if (kind == 0)
                {
                    // Type 0: stored as an outgoing link of this concept.
                    string link = new string(buffer, offset, count).Trim();
                    if (!conceptdata[id].outlinks.Contains(link))
                    {
                        conceptdata[id].outlinks.Add(link);
                    }
                }
                else if (kind == 1)
                {
                    // Type 1: stored as a category; the set ignores duplicates.
                    categorySet.Add(cleancategorytitle(new string(buffer, offset, count), true));
                }
                else if (kind == 2)
                {
                    // Type 2: stored as a redirect; the set ignores duplicates.
                    redirectSet.Add(new string(buffer, offset, count));
                }
                // Any other type is ignored.
            }

            conceptdata[id].categories = categorySet;
            conceptdata[id].redirects = redirectSet;

            // Finally process the article's words.
            processText(res, threadid);
        }
Пример #2
0
        /// <summary>
        /// Registers a category page: maps its cleaned title to the page identifier in
        /// <c>categorydict</c>, and maps the same cleaned title to the set of category
        /// constructs (type 1) found on the page in <c>parentcategorydict</c>.
        /// </summary>
        /// <param name="res">Decoded category page.</param>
        private void processcategory(DecodedTextClass res)
        {
            // Original note: string includes "category:"
            string key = cleancategorytitle(res.title, false);
            categorydict.TryAdd(key, res.identifier);

            // Output slots filled by GetWikiConstruct on each successful call.
            char[] buffer = new char[0];
            int offset = 0;
            int count = 0;
            int kind = 0;

            HashSet<string> parents = new HashSet<string>();
            int total = res.NumberWikiConstructs();
            for (int index = 0; index < total; index++)
            {
                if (!res.GetWikiConstruct(index, ref kind, ref buffer, ref offset, ref count))
                {
                    continue;
                }

                // Only category constructs (type 1) are collected here.
                if (kind == 1)
                {
                    // The set silently ignores a title it already holds.
                    parents.Add(cleancategorytitle(new string(buffer, offset, count), true));
                }
            }

            parentcategorydict.TryAdd(key, parents);
        }
Пример #3
0
        /// <summary>
        /// Collects the distinct, trimmed targets of every type-0 wiki construct on the
        /// page and stores them in <c>disambigredirect</c> under the page title.
        /// </summary>
        /// <param name="res">Decoded disambiguation page.</param>
        private void processdisambig(DecodedTextClass res)
        {
            HashSet<string> targets = new HashSet<string>();

            // Output slots filled by GetWikiConstruct on each successful call.
            char[] buffer = new char[0];
            int offset = 0;
            int count = 0;
            int kind = 0;

            int total = res.NumberWikiConstructs();
            for (int index = 0; index < total; index++)
            {
                if (!res.GetWikiConstruct(index, ref kind, ref buffer, ref offset, ref count))
                {
                    continue;
                }

                // Only type-0 constructs (links) are of interest here.
                if (kind == 0)
                {
                    // The set silently ignores a link it already holds.
                    targets.Add(new string(buffer, offset, count).Trim());
                }
            }

            disambigredirect.TryAdd(res.title, targets);
        }
Пример #4
0
        /// <summary>
        /// Determines whether a page is a disambiguation page by scanning its category
        /// constructs (type 1) for one whose cleaned title equals "Disambiguation pages".
        /// </summary>
        /// <param name="res">Decoded page to inspect.</param>
        /// <returns>
        /// <c>true</c> as soon as a matching category is found; <c>false</c> when no
        /// category construct matches.
        /// </returns>
        private bool isdisambig(DecodedTextClass res)
        {
            // Output slots filled by GetWikiConstruct on each successful call.
            char[] chararray = new char[0];
            int startindex = 0;
            int length = 0;
            int type = 0;

            int n = res.NumberWikiConstructs();
            for (int i = 0; i < n; i++)
            {
                if (!res.GetWikiConstruct(i, ref type, ref chararray, ref startindex, ref length))
                {
                    continue;
                }

                // Only category constructs (type 1) can identify a disambiguation page.
                if (type != 1)
                {
                    continue;
                }

                // NOTE(review): the original comment says the processor has already removed
                // the "Category:" prefix from these constructs - confirm against the decoder.
                string cat = cleancategorytitle(new string(chararray, startindex, length), true);
                if (cat == "Disambiguation pages")
                {
                    return true;
                }
            }

            return false;
        }