Beispiel #1
0
        /// <summary>
        /// Assembles an in-memory index of region-labelled Spanish text sources, prints it to the
        /// console, and feeds it to <c>LoadTextDatabase</c> together with the resource directory
        /// <c>../../res/regiones/</c>.
        /// </summary>
        /// <remarks>
        /// Every index line has the form <c>key:value;key:value name</c> followed by a newline.
        /// NOTE(review): entries tagged <c>region:costarica</c> in the non-test section are added
        /// even when <paramref name="costarica"/> is false - confirm this is intended.
        /// </remarks>
        /// <param name="test">When true, the additional literature and miscellaneous entries are left out.</param>
        /// <param name="shorten">When true, each series in the result keeps only its first 750 data items.</param>
        /// <param name="costarica">When true, "costarica" is added to the list of per-region sources.</param>
        /// <param name="cuba">When true, "cuba" is added to the list of per-region sources and the extra Cuban entries are appended.</param>
        /// <returns>The database produced by <c>LoadTextDatabase</c>, optionally truncated per series.</returns>
        public static DiscreteSeriesDatabase<string> LoadRegionsDatabase(bool test = false, bool shorten = false, bool costarica = true, bool cuba = true)
        {
            const string directory = "../../res/regiones/";

            // Base regions; the optional ones are attached with Cons (presumably a prepend - verify).
            string[] regions = "españa argentina méxico colombia".Split(' ');
            if (costarica)
            {
                regions = "costarica".Cons(regions).ToArray();
            }
            if (cuba)
            {
                regions = "cuba".Cons(regions).ToArray();
            }

            StringBuilder index = new StringBuilder();

            // Each pair is { type label, file-name prefix }. For every pair, one line per region
            // is emitted, in the same order as the original five passes over the region list.
            string[][] perRegionSources =
            {
                new[] { "news", "" },
                new[] { "wiki", "literatura" },
                new[] { "wiki", "historia" },
                new[] { "wiki", "lengua" },
                new[] { "receta", "recetas" },
            };
            foreach (string[] source in perRegionSources)
            {
                string typeLabel = source[0];
                string namePrefix = source[1];
                foreach (string region in regions)
                {
                    index.Append("region:").Append(region)
                         .Append(";type:").Append(typeLabel)
                         .Append(" ").Append(namePrefix).Append(region)
                         .Append("\n");
                }
            }

            if (!test)
            {
                // Literature: the two lists are parallel (region i labels name i).
                string[] literatureRegions = "costarica costarica españa españa españa argentina argentina argentina argentina argentina argentina españa españa españa españa méxico méxico méxico méxico méxico méxico méxico colombia colombia colombia colombia colombia".Split(' ');
                string[] literatureNames   = "leyendascr elisadelmar juanvaleraavuelaplumaespaña juanvaleraloscordobesesespaña marianela historiauniversal lamuerte buenosaires derroterosyviages fundaciondelaciudad laargentina mosenmillan historiadejudios viajosporespaña recuerdosybellezas leyendasmayas nahuatl laberinto comoaguaparachocolate mitoshorroresmexicanos leyendasmexicanas mitosurbanesmexicanos lamultituderrante viajoscolombianos leyendasurbanascolombianas mitoscolombianos mitoscolombianos2".Split(' ');

                // Stop at the shorter list, mirroring the pairing behaviour of Zip.
                for (int i = 0; i < literatureRegions.Length && i < literatureNames.Length; i++)
                {
                    index.Append("region:").Append(literatureRegions[i])
                         .Append(";type:literature ")
                         .Append(literatureNames[i])
                         .Append("\n");
                }

                // Miscellaneous sources: again two parallel lists, tag i belongs to name i.
                string[] miscNames = (
                    "salud antologia9 escorpionescr teca vacunoscr lanación universidadcr recetascostarica2 recetascostarica3 crcrawl presidentecostarica gobiernocostarica " +
                    "arqueologiamaya poesiamexicana catolicismosocial unam mxcrawl cocrawl cocrawl2 desplazadoscolombianos mexicocnn méxicolgbt méxicogob historiaazteca historiaazteca2 " +
                    "ordenamientoterretorrial competitividad ministerio"
                    ).Split(' ');
                string[] miscTags = (
                    "region:costarica region:costarica region:costarica region:costarica region:costarica;type:paper region:costarica;type:news region:costarica region:costarica;type:receta region:costarica;type:receta region:costarica;type:website region:costarica;type:wiki region:costarica;type:wiki " +
                    "region:méxico region:méxico;type:paper region:méxico;type:paper region:méxico;type:paper region:méxico;type:website region:colombia;type:website region:colombia;type:website region:colombia;type:wiki region:méxico;type:news region:méxico;type:brochure region:méxico;type:website region:méxico region:méxico " +
                    "region:colombia region:colombia region:colombia"
                    ).Split(' ');

                IEnumerable<string> miscLines = miscTags.Zip(miscNames, (tag, name) => string.Concat(tag, " ", name));
                index.Append(miscLines.FoldToString("", "\n", "\n"));
            }

            if (cuba)
            {
                index.Append("region:cuba;type:wiki cubaisla\n");
                index.Append("region:cuba;type:receta recetascuba2\n");
                index.Append("region:cuba;type:receta recetascuba3\n");
                index.Append("region:cuba;type:literatura lahistoriame\n");
                index.Append("region:cuba;type:literatura elencuentro\n");
            }

            string file = index.ToString();

            Console.WriteLine("Regions Database:");
            Console.WriteLine(file);

            DiscreteSeriesDatabase<string> database = new DiscreteSeriesDatabase<string>();
            TextReader reader = new StringReader(file);

            // The trailing 3 is passed through unchanged; its meaning is defined by LoadTextDatabase.
            database.LoadTextDatabase(directory, reader, DatabaseLoader.ProcessSpanishText, 3);

            if (!shorten)
            {
                return database;
            }

            // Rebuild the database with each series cut down to its first 750 data items.
            DiscreteSeriesDatabase<string> loaded = database;
            return new DiscreteSeriesDatabase<string>(
                loaded.Select(series => new DiscreteEventSeries<string>(series.labels, series.data.Take(750).ToArray())));
        }
Beispiel #2
0
        /// <summary>
        /// Loads a news database from the key file <paramref name="fileName"/> + "key" (with
        /// <paramref name="fileName"/> + "/" passed to <c>LoadTextDatabase</c> as its first argument,
        /// presumably the document directory), then cleans up the "author", "filename" and
        /// "location" labels and adds a heuristic "gender" label derived from the author's first name.
        /// </summary>
        /// <param name="fileName">Path prefix shared by the key file and the data directory.</param>
        /// <param name="count">
        /// When positive, only approximately the last <paramref name="count"/> key-file lines are
        /// loaded. The start position is estimated by byte offset, so the number of lines is not exact.
        /// When zero or negative, the whole key file is loaded.
        /// </param>
        /// <returns>The loaded database with post-processed labels.</returns>
        public static DiscreteSeriesDatabase <string> getNewsDataset(string fileName, int count = 0)
        {
            DiscreteSeriesDatabase <string> data = new DiscreteSeriesDatabase <string> ();

            using (StreamReader keyfile = File.OpenText(fileName + "key")) {
                if (count > 0)
                {
                    // Budget 107 bytes per requested line (the original note put the average line
                    // at ~81 characters). long arithmetic avoids int overflow for large counts.
                    long tailBytes = 107L * count;

                    // Seeking to a position before the start of the stream throws, so only skip
                    // ahead when the file is longer than the budget; otherwise read it all.
                    if (tailBytes < keyfile.BaseStream.Length)
                    {
                        keyfile.BaseStream.Seek(-tailBytes, System.IO.SeekOrigin.End);
                        keyfile.ReadLine();                      //We almost certainly landed mid-line: drop the partial line.
                    }
                }
                data.LoadTextDatabase(fileName + "/", keyfile, DatabaseLoader.ProcessEnglishText, 1);
            }

            //Do some processing on the database
            foreach (DiscreteEventSeries <string> item in data.data)
            {
                //Normalize the author: underscores become spaces, a handful of stray characters are
                //stripped, and #, $ and & are prefixed with a backslash.
                string author = AsciiOnly(item.labels ["author"], false).RegexReplace(@"_+", @" ").RegexReplace(@"(?:[<])|(?:^[ ,])|(?:$)|(?:\')|(?:\\)", "").RegexReplace(@"([#$&])", @"\$1");
                author = manualRenames.GetWithDefault(author, author);

                //The regex above strips at most one leading space/comma, and the manual rename runs
                //after it, so a leading space can still be present here.
                if (author.StartsWith(@" "))
                {
                    author = author.Substring(1);
                }

                if (invalidAuthors.Contains(author))
                {
                    item.labels.Remove("author");
                }
                else
                {
                    item.labels ["author"] = NameCase(author);                     //Put the formatting done above back into db

                    string[] authSplit = author.Split(' ');
                    string   firstName = authSplit[0].ToLower();
                    if (titles.Contains(firstName) && authSplit.Length > 1)
                    {
                        if (authSplit.Length == 2)
                        {
                            //Just a title and a last name.
                            firstName = "a";                             //Single letter: will be marked neutral below.
                        }
                        else
                        {
                            //Lower-case like the untitled path above, so the name-set lookups
                            //below see the same form in both cases.
                            firstName = authSplit[1].ToLower();
                        }
                    }

                    if (neutralNames.Contains(firstName) || firstName.Length <= 1)
                    {
                        //Gender unknown. An empty name (empty author, or consecutive spaces) also
                        //ends up here rather than indexing past the start of the string below.
                    }
                    else if (maleNames.Contains(firstName) || firstName.EndsWith("ndra"))
                    {
                        //NOTE(review): the "ndra" suffix is classified male before the trailing-'a'
                        //rule can apply - confirm this is the intended handling for such names.
                        item.labels["gender"] = "male";
                    }
                    else if (firstName[firstName.Length - 1] == 'a' || firstName.EndsWith("ee") || femaleNames.Contains(firstName))
                    {
                        item.labels["gender"] = "female";
                    }
                    else if ("eiou".Contains(firstName[firstName.Length - 1]))
                    {
                        //Gender unknown (suspected female)
                    }
                    else
                    {
                        //Anything left has more than one character and ends in a non-vowel.
                        item.labels["gender"] = "male";
                    }
                }

                item.labels ["filename"] = item.labels ["filename"].Replace("_", " ").RegexReplace("([#$&])", "\\$1");
                if (item.labels.ContainsKey("location"))
                {
                    //Same clean-up as the filename, followed by the manual rename table and name casing.
                    string location = item.labels ["location"].Replace("_", " ").RegexReplace("([#$&])", "\\$1");
                    location = manualLocationRenames.GetWithDefault(location, location);
                    item.labels ["location"] = NameCase(location);
                }
            }

            return(data);
        }