/// <summary>
/// Console entry point. Registers the assembly-resolve handler, initializes the
/// CorpusExplorer ecosystem with caching disabled, configures console encoding and
/// culture, and then hands the arguments to <c>Execute</c>.
/// </summary>
/// <param name="args">Command-line arguments, passed unchanged to <c>Execute</c>.</param>
private static void Main(string[] args)
{
    // Subscribed before the ecosystem is initialized.
    AppDomain.CurrentDomain.AssemblyResolve += CurrentDomain_AssemblyResolve;
    CorpusExplorerEcosystem.Initialize(new CacheStrategyDisableCaching());

    // One encoding for both console directions; output is assigned first, then input.
    var consoleEncoding = Configuration.Encoding;
    System.Console.OutputEncoding = consoleEncoding;
    System.Console.InputEncoding = consoleEncoding;

    // Invariant culture for the current thread and as the default for new threads.
    var invariant = CultureInfo.InvariantCulture;
    Thread.CurrentThread.CurrentCulture = invariant;
    CultureInfo.DefaultThreadCurrentCulture = invariant;

    Execute(args);
}
/// <summary>
/// Creates the demo form: initializes the minimal CorpusExplorer ecosystem, makes sure
/// the local "corpus" directory exists, builds the UI components and, when the corpus
/// file is present, opens the quick index and loads the GUID-to-string dictionary
/// from "corpus/data.bin". Progress is reported on the console.
/// </summary>
public QuickDemo()
{
    Console.Write("INIT...");
    CorpusExplorerEcosystem.InitializeMinimal();

    // Directory.CreateDirectory does nothing when the directory already exists,
    // so no separate existence check is required.
    Directory.CreateDirectory("corpus");

    InitializeComponent();

#if DEBUG
    // Debug builds only: when the corpus is not found, retry below W:/eBooks-MFB/.
    if (!File.Exists(_corpusPath))
    {
        _corpusPath = "W:/eBooks-MFB/" + _corpusPath;
    }
#endif

    var corpusAvailable = File.Exists(_corpusPath);
    if (corpusAvailable)
    {
        _quickIndex = new QuickIndex(_corpusPath);
        _dict = Serializer.Deserialize<Dictionary<Guid, string>>("corpus/data.bin");
    }

    Console.WriteLine("OK!");
}
/// <summary>
/// De-duplicates scraped tweets per language and saves one corpus per language.
/// For each of the languages "de", "en" and "el", all files matching
/// <c>tweets_{language}_*.sdd</c> in the user-selected folder are read, entries are
/// de-duplicated by the SHA-256 hash of their "Text" value, the unique entries are
/// tagged, and the first resulting corpus is saved as <c>tweets_{language}.cec6</c>
/// in the same folder.
/// </summary>
/// <remarks>
/// NOTE(review): FolderBrowserDialog needs an STA thread - confirm that [STAThread]
/// is applied to this entry point.
/// </remarks>
/// <param name="args">Command-line arguments (not used).</param>
private static void Main(string[] args)
{
    CorpusExplorerEcosystem.InitializeMinimal();

    // The dialog is a disposable component; release it as soon as the path is known.
    string folder;
    using (var fbd = new FolderBrowserDialog())
    {
        if (fbd.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        folder = fbd.SelectedPath;
    }

    var languages = new[] { "de", "en", "el" };
    foreach (var language in languages)
    {
        var files = Directory.GetFiles(folder, $"tweets_{language}_*.sdd");
        Console.WriteLine($"{language} with {files.Length} clusters");

        var hashes = new HashSet<string>();
        var tagger = new RawTextTagger
        {
            CorpusBuilder = new CorpusBuilderWriteDirect(),
            Tokenizer = new HighSpeedSpaceTokenizer()
        };

        var all = 0;
        var duplicates = 0;
        var skipped = 0;

        // One hash instance per language is sufficient; ComputeHash can be called repeatedly.
        using (var hash = SHA256.Create())
        {
            foreach (var file in files)
            {
                var sdd = Serializer.Deserialize<Dictionary<string, object>[]>(file);
                if (sdd == null)
                {
                    // Serializer is defined elsewhere; guard in case it yields null - TODO confirm.
                    Console.Error.WriteLine($"{language}: could not read {file}");
                    continue;
                }

                all += sdd.Length;

                foreach (var x in sdd)
                {
                    // Single lookup; entries without a usable text cannot be hashed.
                    object text;
                    if (!x.TryGetValue("Text", out text) || text == null)
                    {
                        skipped++;
                        continue;
                    }

                    try
                    {
                        var tHash = Convert.ToBase64String(hash.ComputeHash(Encoding.UTF8.GetBytes(text.ToString())));

                        // HashSet.Add returns false when the hash is already known.
                        if (!hashes.Add(tHash))
                        {
                            duplicates++;
                            continue;
                        }

                        tagger.Input.Enqueue(x);
                    }
                    catch (Exception ex)
                    {
                        // Still best-effort (one bad entry must not stop the run), but no longer silent.
                        skipped++;
                        Console.Error.WriteLine($"{language}: skipped an entry of {file} - {ex.Message}");
                    }
                }
            }
        }

        // Only real hash collisions count as copy-cats; entries without text are reported separately.
        Console.WriteLine($"{language} has {all} tweets - {duplicates} copy-cats - {hashes.Count} original tweets");
        if (skipped > 0)
        {
            Console.WriteLine($"{language} skipped {skipped} entries without usable text");
        }

        tagger.Execute();

        // First() would throw on an empty output (e.g. no input files for this language).
        if (!tagger.Output.Any())
        {
            Console.WriteLine($"{language}: no corpus produced - nothing saved");
            continue;
        }

        tagger.Output.First().Save(Path.Combine(folder, $"tweets_{language}.cec6"));
        Console.WriteLine(language + "...ok!");
    }

    Console.WriteLine("! END !");
    Console.ReadLine();
}
/// <summary>
/// Scrapes tweet JSON files and splits the results by language.
/// The user picks an input folder (JSON files) and an output folder. The JSON files are
/// grouped into packages by <c>MakeFilePackages</c>; the packages are processed in
/// parallel. For every package the scraped entries whose "Sprache" value is "en", "de"
/// or "el" are collected and written to <c>tweets_{language}_{index:D3}.sdd</c> in the
/// output folder. Input files that yield no entry in one of these languages are deleted.
/// </summary>
/// <remarks>
/// NOTE(review): FolderBrowserDialog needs an STA thread - confirm that [STAThread]
/// is applied to this entry point.
/// </remarks>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    CorpusExplorerEcosystem.InitializeMinimal();

    // Both dialogs are disposable components; release each once its path is known.
    string inputFolder;
    using (var fbd = new FolderBrowserDialog { Description = "JSON-Folder" })
    {
        if (fbd.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        inputFolder = fbd.SelectedPath;
    }

    string outputFolder;
    using (var sfd = new FolderBrowserDialog { Description = "Output-Folder" })
    {
        if (sfd.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        outputFolder = sfd.SelectedPath;
    }

    var packages = MakeFilePackages(Directory.GetFiles(inputFolder, "*.json", SearchOption.TopDirectoryOnly));

    // Languages that are kept; the order also defines the order of the output files.
    var languages = new[] { "en", "de", "el" };

    Parallel.For(
        0,
        packages.Count,
        i =>
        {
            try
            {
                var package = packages[i];

                // One bucket per language; keys are compared ordinally and case-sensitively.
                var buckets = new Dictionary<string, List<Dictionary<string, object>>>();
                foreach (var language in languages)
                {
                    buckets.Add(language, new List<Dictionary<string, object>>());
                }

                foreach (var file in package)
                {
                    var scraper = new TwitterScraper();
                    scraper.Input.Enqueue(file);
                    scraper.Execute();

                    var valid = false;
                    foreach (var x in scraper.Output.ToArray())
                    {
                        object rawLanguage;
                        if (!x.TryGetValue("Sprache", out rawLanguage))
                        {
                            continue;
                        }

                        // A null language value must not abort the whole package; treat it as "no match".
                        var key = rawLanguage?.ToString();
                        List<Dictionary<string, object>> bucket;
                        if (key == null || !buckets.TryGetValue(key, out bucket))
                        {
                            continue;
                        }

                        bucket.Add(x);
                        valid = true;
                    }

                    if (valid)
                    {
                        continue;
                    }

                    // The file contributed nothing in a wanted language: remove it (best effort).
                    try
                    {
                        File.Delete(file);
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine($"Could not delete {file}: {ex.Message}");
                    }
                }

                foreach (var language in languages)
                {
                    Serializer.Serialize(
                        buckets[language].ToArray(),
                        Path.Combine(outputFolder, $"tweets_{language}_{i:D3}.sdd"),
                        true);
                }
            }
            catch (Exception ex)
            {
                // Written with a single call so reports of parallel workers do not interleave.
                Console.WriteLine(string.Join(Environment.NewLine, ex.Message, "-----------", ex.StackTrace, "-----#-----"));
            }
        });

    Console.WriteLine("! END !");
    Console.ReadLine();
}
/// <summary>
/// Scrapes article HTML files from a user-selected folder into "output.sdd".
/// Files smaller than 4096 bytes are deleted. For each remaining file the Open Graph
/// title, type and URL, the author and (if present) the translator are extracted,
/// navigation and decoration nodes are removed via <c>DeleteNode</c>, and the inner
/// text of <c>/html/body/main</c> is stored as "Text".
/// </summary>
/// <remarks>
/// NOTE(review): FolderBrowserDialog needs an STA thread - confirm that [STAThread]
/// is applied to this entry point.
/// </remarks>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    CorpusExplorerEcosystem.InitializeMinimal();

    // The dialog is a disposable component; release it as soon as the path is known.
    string folder;
    using (var fbd = new FolderBrowserDialog())
    {
        if (fbd.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        folder = fbd.SelectedPath;
    }

    var files = Directory.GetFiles(folder, "*.html");
    var valid = new List<string>();

    // Sort the wheat from the chaff: files below 4096 bytes are deleted, the rest are scraped.
    foreach (var f in files)
    {
        if (new FileInfo(f).Length < 4096)
        {
            File.Delete(f);
        }
        else
        {
            valid.Add(f);
        }
    }

    const string translatorPrefix = "Aus dem Englischen von ";

    // Nodes that are removed before the article text is read, in this order.
    var obsoleteNodes = new[]
    {
        "//a",
        "/html/body/header",
        "/html/body/main/div/section[@class='article-footnotes']",
        "//div[@class='article-authors']",
        "//div[@class='article-title header-title']",
        "//div[@class='article-quotes']",
        "//div[@class='article-sticky-image-group']",
        "//div[@class='article-image']",
        "//div[@class='article-image-single article-image-single-medium article-image-single-landscape']",
        "//div[@class='article-image-single article-image-single-large article-image-single-landscape']",
        "//div[@class='article-image-single article-image-single-large article-image-single-portrait']"
    };

    var scrap = new List<Dictionary<string, object>>();
    foreach (var f in valid)
    {
        var doc = new HtmlAgilityPack.HtmlDocument();
        doc.Load(f, Configuration.Encoding);

        var res = new Dictionary<string, object>
        {
            { "Titel", doc.DocumentNode.SelectSingleNode("/html/head/meta[@property='og:title']")?.GetAttributeValue("content", "") },
            { "Type", doc.DocumentNode.SelectSingleNode("/html/head/meta[@property='og:type']")?.GetAttributeValue("content", "") },
            { "URL", doc.DocumentNode.SelectSingleNode("/html/head/meta[@property='og:url']")?.GetAttributeValue("content", "") },
            { "Autor", doc.DocumentNode.SelectSingleNode("/html/body/main/div/div[@class='article-authors']")?.InnerText?.Replace("Von ", "") }
        };

        // The first <em> that starts with the translator prefix names the translator.
        var trans = doc.DocumentNode.SelectNodes("//em");
        if (trans != null)
        {
            foreach (var n in trans)
            {
                var txt = n.InnerText;

                // Ordinal comparison: the prefix is a fixed marker, not culture-dependent text.
                if (!txt.StartsWith(translatorPrefix, System.StringComparison.Ordinal))
                {
                    continue;
                }

                // Strip only the leading prefix; later occurrences inside the name stay untouched.
                res.Add("Übersetzer", txt.Substring(translatorPrefix.Length));
                n.ParentNode.RemoveChild(n);
                break;
            }
        }

        foreach (var xpath in obsoleteNodes)
        {
            DeleteNode(ref doc, xpath);
        }

        // doc is passed by ref above, so DocumentNode is read again here.
        var main = doc.DocumentNode.SelectSingleNode("/html/body/main");
        if (main == null)
        {
            // A page without a main element must not abort the run and lose all results.
            System.Console.WriteLine($"Skipped {f}: no /html/body/main element found");
            continue;
        }

        res.Add("Text", main.InnerText);
        scrap.Add(res);
    }

    Serializer.Serialize(scrap.ToArray(), "output.sdd", true);
}