/// <summary>
/// Builds an in-memory manifest of the regional Spanish corpora (one
/// "labels file-name" entry per line), prints it to the console, and loads the
/// listed documents from "../../res/regiones/" into a new database.
/// </summary>
/// <param name="test">When true, only the per-region standard entries (plus the Cuba extras, if enabled) are listed; the supplementary literature and miscellaneous entries are omitted.</param>
/// <param name="shorten">When true, every loaded series is truncated to its first 750 events.</param>
/// <param name="costarica">When true, "costarica" is added to the list of regions used for the per-region standard entries.</param>
/// <param name="cuba">When true, "cuba" is added to the list of regions and the Cuba-specific extra entries are appended.</param>
/// <returns>The loaded (and optionally shortened) database.</returns>
public static DiscreteSeriesDatabase <string> LoadRegionsDatabase(bool test = false, bool shorten = false, bool costarica = true, bool cuba = true)
{
    string directory = "../../res/regiones/";

    string[] regions = "españa argentina méxico colombia".Split(' ');
    if (costarica)
    {
        regions = "costarica".Cons(regions).ToArray();
    }
    if (cuba)
    {
        regions = "cuba".Cons(regions).ToArray();
    }

    StringBuilder manifest = new StringBuilder();

    // Every region contributes one entry per { type label, file-name prefix } pair.
    // Entries are grouped by pair (all regions for "news", then all for the first
    // "wiki" prefix, ...), which is the order the manifest has always been emitted in.
    string[][] perRegionSources = new[]
    {
        new[] { "news", "" },
        new[] { "wiki", "literatura" },
        new[] { "wiki", "historia" },
        new[] { "wiki", "lengua" },
        new[] { "receta", "recetas" },
    };
    foreach (string[] source in perRegionSources)
    {
        foreach (string region in regions)
        {
            manifest.Append("region:").Append(region)
                    .Append(";type:").Append(source[0])
                    .Append(" ").Append(source[1]).Append(region)
                    .Append("\n");
        }
    }

    if (!test)
    {
        // NOTE(review): the supplementary entries below include Costa Rica material
        // even when the costarica flag is false - confirm whether that is intended.

        // Literature: the i-th region labels the i-th file name (27 of each).
        string[] literatureRegions = (
            "costarica costarica " +
            "españa españa españa " +
            "argentina argentina argentina argentina argentina argentina " +
            "españa españa españa españa " +
            "méxico méxico méxico méxico méxico méxico méxico " +
            "colombia colombia colombia colombia colombia"
        ).Split(' ');
        string[] literatureNames = (
            "leyendascr elisadelmar " +
            "juanvaleraavuelaplumaespaña juanvaleraloscordobesesespaña marianela " +
            "historiauniversal lamuerte buenosaires derroterosyviages fundaciondelaciudad laargentina " +
            "mosenmillan historiadejudios viajosporespaña recuerdosybellezas " +
            "leyendasmayas nahuatl laberinto comoaguaparachocolate mitoshorroresmexicanos leyendasmexicanas mitosurbanesmexicanos " +
            "lamultituderrante viajoscolombianos leyendasurbanascolombianas mitoscolombianos mitoscolombianos2"
        ).Split(' ');
        IEnumerable <string> literatureLines = literatureRegions.Zip(
            literatureNames,
            (litRegion, litName) => "region:" + litRegion + ";type:literature " + litName);
        foreach (string line in literatureLines)
        {
            manifest.Append(line).Append("\n");
        }

        // Miscellaneous documents: the i-th tag string labels the i-th file name (28 of each).
        string[] names = (
            "salud antologia9 escorpionescr teca vacunoscr lanación universidadcr recetascostarica2 recetascostarica3 crcrawl presidentecostarica gobiernocostarica " +
            "arqueologiamaya poesiamexicana catolicismosocial unam mxcrawl cocrawl cocrawl2 desplazadoscolombianos mexicocnn méxicolgbt méxicogob historiaazteca historiaazteca2 " +
            "ordenamientoterretorrial competitividad ministerio"
        ).Split(' ');
        string[] tags = (
            "region:costarica region:costarica region:costarica region:costarica region:costarica;type:paper region:costarica;type:news region:costarica region:costarica;type:receta region:costarica;type:receta region:costarica;type:website region:costarica;type:wiki region:costarica;type:wiki " +
            "region:méxico region:méxico;type:paper region:méxico;type:paper region:méxico;type:paper region:méxico;type:website region:colombia;type:website region:colombia;type:website region:colombia;type:wiki region:méxico;type:news region:méxico;type:brochure region:méxico;type:website region:méxico region:méxico " +
            "region:colombia region:colombia region:colombia"
        ).Split(' ');
        manifest.Append(tags.Zip(names, (tag, name) => tag + " " + name).FoldToString("", "\n", "\n"));
    }

    if (cuba)
    {
        manifest.Append("region:cuba;type:wiki cubaisla\n");
        manifest.Append("region:cuba;type:receta recetascuba2\n");
        manifest.Append("region:cuba;type:receta recetascuba3\n");
        manifest.Append("region:cuba;type:literatura lahistoriame\n");
        manifest.Append("region:cuba;type:literatura elencuentro\n");
    }

    string file = manifest.ToString();
    Console.WriteLine("Regions Database:");
    Console.WriteLine(file);

    DiscreteSeriesDatabase <string> d = new DiscreteSeriesDatabase <string> ();
    // The reader is only needed while LoadTextDatabase runs, so it is released right after.
    using (TextReader reader = new StringReader(file))
    {
        // NOTE(review): the meaning of the trailing 3 is defined by LoadTextDatabase, not visible here.
        d.LoadTextDatabase(directory, reader, DatabaseLoader.ProcessSpanishText, 3);
    }

    if (shorten)
    {
        // Keep the labels of each series but only its first 750 events.
        d = new DiscreteSeriesDatabase <string>(d.Select(item => new DiscreteEventSeries <string>(item.labels, item.data.Take(750).ToArray())));
    }

    return d;
}
/// <summary>
/// Loads a news corpus described by the key file at <c>fileName + "key"</c>
/// (documents are read from the directory <c>fileName + "/"</c>) and then
/// normalizes the "author", "filename" and "location" labels of every series,
/// adding a heuristic "gender" label where one can be guessed from the author name.
/// </summary>
/// <param name="fileName">Path prefix of the dataset; "key" is appended for the key file and "/" for the data directory.</param>
/// <param name="count">
/// When positive, only the tail of the key file is read: the reader seeks back
/// 107 bytes per requested entry from the end of the file, so the number of entries
/// actually loaded is approximate. When zero or negative the whole key file is read.
/// </param>
/// <returns>The loaded database with cleaned-up labels.</returns>
public static DiscreteSeriesDatabase <string> getNewsDataset(string fileName, int count = 0)
{
    // Byte budget per key-file entry used to locate the tail of the file.
    // The original note here says the average line is ~81 characters, so 107
    // presumably leaves some headroom.
    const long BytesPerKeyLine = 107;

    DiscreteSeriesDatabase <string> data = new DiscreteSeriesDatabase <string> ();
    using (StreamReader keyfile = File.OpenText(fileName + "key"))
    {
        if (count > 0)
        {
            // Computed in 64-bit and clamped to the file size: seeking to a position
            // before the start of the file throws, which used to happen whenever the
            // key file was shorter than the requested tail.
            long fileLength = keyfile.BaseStream.Length;
            long tailBytes = Math.Min(BytesPerKeyLine * count, fileLength);
            keyfile.BaseStream.Seek(-tailBytes, System.IO.SeekOrigin.End);
            keyfile.DiscardBufferedData();

            if (tailBytes < fileLength)
            {
                // We most likely landed in the middle of a line; skip the fragment.
                // (Not done when we are at the very start, where the first line is complete.)
                keyfile.ReadLine();
            }
        }

        // NOTE(review): the meaning of the trailing 1 is defined by LoadTextDatabase, not visible here.
        data.LoadTextDatabase(fileName + "/", keyfile, DatabaseLoader.ProcessEnglishText, 1);
    }

    // Post-process the labels of every loaded series.
    foreach (DiscreteEventSeries <string> item in data.data)
    {
        // Author clean-up: underscores become spaces; '<', a single leading space or
        // comma, apostrophes and backslashes are removed; '#', '$' and '&' are rewritten
        // with the "\$1" replacement (presumably RegexReplace wraps Regex.Replace, which
        // would prefix them with a backslash - confirm against the helper).
        string author = AsciiOnly(item.labels ["author"], false)
            .RegexReplace(@"_+", @" ")
            .RegexReplace(@"(?:[<])|(?:^[ ,])|(?:$)|(?:\')|(?:\\)", "")
            .RegexReplace(@"([#$&])", @"\$1");
        author = manualRenames.GetWithDefault(author, author);

        // The pattern above strips only ONE leading space/comma (the ^[ ,] alternative
        // matches a single character), so input such as ", name" still starts with a space.
        if (author.StartsWith(" ", StringComparison.Ordinal))
        {
            author = author.Substring(1);
        }

        if (invalidAuthors.Contains(author))
        {
            item.labels.Remove("author");
        }
        else
        {
            // Store the cleaned-up name back into the database.
            item.labels ["author"] = NameCase(author);

            string gender = InferGenderFromAuthorName(author);
            if (gender != null)
            {
                item.labels ["gender"] = gender;
            }
        }

        item.labels ["filename"] = item.labels ["filename"].Replace("_", " ").RegexReplace("([#$&])", "\\$1");

        if (item.labels.ContainsKey("location"))
        {
            string location = item.labels ["location"].Replace("_", " ").RegexReplace("([#$&])", "\\$1");
            location = manualLocationRenames.GetWithDefault(location, location);
            item.labels ["location"] = NameCase(location);
        }
    }

    return data;
}

/// <summary>
/// Guesses a gender label from the first name of a cleaned-up author string.
/// </summary>
/// <param name="author">Author name with words separated by single spaces.</param>
/// <returns>"male", "female", or null when no guess is made.</returns>
private static string InferGenderFromAuthorName(string author)
{
    string[] authSplit = author.Split(' ');

    // All name tables and suffix checks below work on lower-case text.
    string firstName = authSplit[0].ToLowerInvariant();

    if (titles.Contains(firstName) && authSplit.Length > 1)
    {
        if (authSplit.Length == 2)
        {
            // A title followed by a single word: just a last name, nothing to go on.
            return null;
        }

        // Skip the title; lower-case the real first name like the untitled case.
        firstName = authSplit[1].ToLowerInvariant();
    }

    // Empty tokens (blank author, doubled spaces) and single initials carry no
    // information; checking this first also keeps the last-character lookups
    // below from indexing into an empty string.
    if (firstName.Length <= 1 || neutralNames.Contains(firstName))
    {
        return null;
    }

    // NOTE(review): the "ndra" suffix is classified as male here; this is kept
    // exactly as before - confirm it is intentional.
    if (maleNames.Contains(firstName) || firstName.EndsWith("ndra", StringComparison.Ordinal))
    {
        return "male";
    }

    char lastLetter = firstName[firstName.Length - 1];
    if (lastLetter == 'a' || firstName.EndsWith("ee", StringComparison.Ordinal) || femaleNames.Contains(firstName))
    {
        return "female";
    }

    if ("eiou".Contains(lastLetter))
    {
        // Gender unknown (suspected female).
        return null;
    }

    // Anything else of two or more letters defaults to male.
    return "male";
}