/// <summary>
/// Program entry point: registers the assembly resolver, initializes the ecosystem with
/// caching disabled, configures console encoding and thread culture, then hands over to
/// <c>Execute</c>.
/// </summary>
/// <param name="args">Command-line arguments, forwarded unchanged to <c>Execute</c>.</param>
private static void Main(string[] args)
 {
     // The resolver is hooked up before anything else in this method runs.
     AppDomain.CurrentDomain.AssemblyResolve += CurrentDomain_AssemblyResolve;

     var cacheStrategy = new CacheStrategyDisableCaching();
     CorpusExplorerEcosystem.Initialize(cacheStrategy);

     // Console output and input share the configured encoding (output is set first).
     var consoleEncoding = Configuration.Encoding;
     System.Console.OutputEncoding = consoleEncoding;
     System.Console.InputEncoding  = consoleEncoding;

     // Invariant culture for the current thread and as the default for new threads.
     var invariantCulture = CultureInfo.InvariantCulture;
     Thread.CurrentThread.CurrentCulture     = invariantCulture;
     CultureInfo.DefaultThreadCurrentCulture = invariantCulture;

     Execute(args);
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Creates the demo window: initializes the ecosystem, makes sure the "corpus" folder
        /// exists, builds the UI components and, when the corpus file is present, loads the
        /// quick index and the dictionary stored in "corpus/data.bin".
        /// Progress is reported on the console ("INIT..." followed by "OK!").
        /// </summary>
        public QuickDemo()
        {
            Console.Write("INIT...");
            CorpusExplorerEcosystem.InitializeMinimal();

            var corpusFolderExists = Directory.Exists("corpus");
            if (!corpusFolderExists)
            {
                Directory.CreateDirectory("corpus");
            }

            InitializeComponent();

#if DEBUG
            // Debug builds fall back to a fixed drive location when the corpus file is not found.
            var foundAtConfiguredPath = File.Exists(_corpusPath);
            if (!foundAtConfiguredPath)
            {
                _corpusPath = "W:/eBooks-MFB/" + _corpusPath;
            }
#endif

            // Index and dictionary are only loaded when the corpus file actually exists.
            var corpusAvailable = File.Exists(_corpusPath);
            if (corpusAvailable)
            {
                _quickIndex = new QuickIndex(_corpusPath);
                _dict       = Serializer.Deserialize<Dictionary<Guid, string>>("corpus/data.bin");
            }

            Console.WriteLine("OK!");
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Lets the user pick a folder, then for each language ("de", "en", "el") reads all
        /// <c>tweets_{language}_*.sdd</c> files in it, drops entries whose "Text" was already
        /// seen (compared via SHA-256 of the UTF-8 text), feeds the remaining entries to a
        /// <c>RawTextTagger</c> and saves the first tagger output as <c>tweets_{language}.cec6</c>
        /// in the same folder.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        private static void Main(string[] args)
        {
            CorpusExplorerEcosystem.InitializeMinimal();

            // NOTE(review): FolderBrowserDialog.ShowDialog normally requires an STA thread -
            // confirm that this entry point is marked with [STAThread].
            string folder;
            using (var fbd = new FolderBrowserDialog())
            {
                if (fbd.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                // Copy the path so the dialog can be disposed right away.
                folder = fbd.SelectedPath;
            }

            var languages = new[] { "de", "en", "el" };

            // One hash instance is enough for the whole run; ComputeHash can be called repeatedly.
            using (var hash = SHA256.Create())
            {
                foreach (var language in languages)
                {
                    var files = Directory.GetFiles(folder, $"tweets_{language}_*.sdd");
                    Console.WriteLine($"{language} with {files.Length} clusters");

                    var hashes = new HashSet<string>();

                    var tagger = new RawTextTagger
                    {
                        CorpusBuilder = new CorpusBuilderWriteDirect(),
                        Tokenizer     = new HighSpeedSpaceTokenizer()
                    };

                    var all        = 0; // every entry found in the cluster files
                    var duplicates = 0; // entries whose text hash was already known
                    var skipped    = 0; // entries without usable text or that failed to process

                    foreach (var file in files)
                    {
                        var sdd = Serializer.Deserialize<Dictionary<string, object>[]>(file);
                        if (sdd == null)
                        {
                            // Defensive: nothing to process for this file.
                            continue;
                        }

                        all += sdd.Length;

                        foreach (var x in sdd)
                        {
                            object text;
                            if (!x.TryGetValue("Text", out text) || text == null)
                            {
                                skipped++;
                                continue;
                            }

                            try
                            {
                                var tHash = Convert.ToBase64String(hash.ComputeHash(Encoding.UTF8.GetBytes(text.ToString())));

                                // HashSet.Add returns false when the hash is already present.
                                if (!hashes.Add(tHash))
                                {
                                    duplicates++;
                                    continue;
                                }

                                tagger.Input.Enqueue(x);
                            }
                            catch (Exception ex)
                            {
                                // Best effort: a single broken entry must not stop the run,
                                // but it should not disappear silently either.
                                skipped++;
                                Console.WriteLine($"{language}: entry skipped - {ex.Message}");
                            }
                        }
                    }

                    // Duplicates are counted explicitly; entries without text are not "copy-cats".
                    Console.WriteLine($"{language} has {all} tweets - {duplicates} copy-cats - {hashes.Count} original tweets");
                    if (skipped > 0)
                    {
                        Console.WriteLine($"{language} skipped {skipped} entries without usable text");
                    }

                    tagger.Execute();

                    // First() would throw if the tagger produced no output (e.g. no input files).
                    var corpus = tagger.Output.FirstOrDefault();
                    if (corpus == null)
                    {
                        Console.WriteLine(language + "...no output!");
                        continue;
                    }

                    corpus.Save(Path.Combine(folder, $"tweets_{language}.cec6"));

                    Console.WriteLine(language + "...ok!");
                }
            }

            Console.WriteLine("! END !");
            Console.ReadLine();
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Asks for a JSON input folder and an output folder, splits the <c>*.json</c> files into
        /// packages via <c>MakeFilePackages</c> and processes the packages in parallel: every file
        /// is run through a <c>TwitterScraper</c>, the resulting entries are grouped by their
        /// "Sprache" value ("en", "de", "el") and each group is serialized to
        /// <c>tweets_{language}_{package index}.sdd</c> in the output folder.
        /// Input files that yield no entry in one of these languages are deleted.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            CorpusExplorerEcosystem.InitializeMinimal();

            // NOTE(review): FolderBrowserDialog.ShowDialog normally requires an STA thread -
            // confirm that this entry point is marked with [STAThread].
            string inputFolder;
            using (var fbd = new FolderBrowserDialog { Description = "JSON-Folder" })
            {
                if (fbd.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                inputFolder = fbd.SelectedPath;
            }

            // The path is copied into a plain string so the worker threads below never touch
            // a UI object and the dialog can be disposed immediately.
            string outputFolder;
            using (var sfd = new FolderBrowserDialog { Description = "Output-Folder" })
            {
                if (sfd.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                outputFolder = sfd.SelectedPath;
            }

            // Order matters only for the order in which the output files are written.
            var languages = new[] { "en", "de", "el" };

            var packages = MakeFilePackages(Directory.GetFiles(inputFolder, "*.json", SearchOption.TopDirectoryOnly));

            Parallel.For(
                0,
                packages.Count,
                i =>
            {
                try
                {
                    var package = packages[i];

                    // One result list per supported language, local to this package.
                    var buckets = new Dictionary<string, List<Dictionary<string, object>>>();
                    foreach (var language in languages)
                    {
                        buckets.Add(language, new List<Dictionary<string, object>>());
                    }

                    foreach (var file in package)
                    {
                        var scraper = new TwitterScraper();
                        scraper.Input.Enqueue(file);
                        scraper.Execute();

                        var output = scraper.Output.ToArray();

                        var valid = false;
                        foreach (var x in output)
                        {
                            object rawLanguage;
                            if (!x.TryGetValue("Sprache", out rawLanguage))
                            {
                                continue;
                            }

                            // A null value must not throw: that would abort the whole package
                            // before anything is serialized.
                            var entryLanguage = rawLanguage?.ToString();
                            if (entryLanguage == null)
                            {
                                continue;
                            }

                            List<Dictionary<string, object>> bucket;
                            if (!buckets.TryGetValue(entryLanguage, out bucket))
                            {
                                continue;
                            }

                            bucket.Add(x);
                            valid = true;
                        }

                        if (valid)
                        {
                            continue;
                        }

                        // The file contributed nothing in a supported language: remove it (best effort).
                        try
                        {
                            File.Delete(file);
                        }
                        catch (Exception ex)
                        {
                            Console.WriteLine($"Could not delete '{file}': {ex.Message}");
                        }
                    }

                    foreach (var language in languages)
                    {
                        Serializer.Serialize(buckets[language].ToArray(), Path.Combine(outputFolder, $"tweets_{language}_{i:D3}.sdd"), true);
                    }
                }
                catch (Exception ex)
                {
                    // Written with a single call so reports from parallel workers do not interleave.
                    Console.WriteLine(string.Join(Environment.NewLine, ex.Message, "-----------", ex.StackTrace, "-----#-----"));
                }
            });

            Console.WriteLine("! END !");
            Console.ReadLine();
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Scrapes the <c>*.html</c> files of a user-selected folder: files smaller than 4096 bytes
        /// are deleted, the remaining ones are parsed with HtmlAgilityPack, metadata
        /// ("Titel", "Type", "URL", "Autor", optionally "Übersetzer") is extracted, navigation and
        /// decoration nodes are removed via <c>DeleteNode</c>, and the inner text of
        /// <c>/html/body/main</c> is stored as "Text". All records are serialized to "output.sdd".
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            CorpusExplorerEcosystem.InitializeMinimal();

            // NOTE(review): FolderBrowserDialog.ShowDialog normally requires an STA thread -
            // confirm that this entry point is marked with [STAThread].
            string folder;
            using (var fbd = new FolderBrowserDialog())
            {
                if (fbd.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                folder = fbd.SelectedPath;
            }

            // Files below this size (in bytes) are treated as unusable and deleted.
            const long MinimumFileSize = 4096;
            // Marker of the translator credit inside an <em> element.
            const string TranslatorPrefix = "Aus dem Englischen von ";

            // XPath expressions of everything that must not end up in the article text,
            // removed in exactly this order.
            var unwantedNodes = new[]
            {
                "//a",
                "/html/body/header",
                "/html/body/main/div/section[@class='article-footnotes']",
                "//div[@class='article-authors']",
                "//div[@class='article-title header-title']",
                "//div[@class='article-quotes']",
                "//div[@class='article-sticky-image-group']",
                "//div[@class='article-image']",
                "//div[@class='article-image-single article-image-single-medium article-image-single-landscape']",
                "//div[@class='article-image-single article-image-single-large article-image-single-landscape']",
                "//div[@class='article-image-single article-image-single-large article-image-single-portrait']"
            };

            var files = Directory.GetFiles(folder, "*.html");
            var valid = new List<string>();

            // Keep the good ones, discard the bad ones.
            foreach (var f in files)
            {
                if ((new FileInfo(f)).Length < MinimumFileSize)
                {
                    File.Delete(f);
                }
                else
                {
                    valid.Add(f);
                }
            }

            var scrap = new List<Dictionary<string, object>>();

            foreach (var f in valid)
            {
                var doc = new HtmlAgilityPack.HtmlDocument();
                doc.Load(f, Configuration.Encoding);
                var res = new Dictionary<string, object>
                {
                    { "Titel", doc.DocumentNode.SelectSingleNode("/html/head/meta[@property='og:title']")?.GetAttributeValue("content", "") },
                    { "Type", doc.DocumentNode.SelectSingleNode("/html/head/meta[@property='og:type']")?.GetAttributeValue("content", "") },
                    { "URL", doc.DocumentNode.SelectSingleNode("/html/head/meta[@property='og:url']")?.GetAttributeValue("content", "") },
                    { "Autor", doc.DocumentNode.SelectSingleNode("/html/body/main/div/div[@class='article-authors']")?.InnerText?.Replace("Von ", "") }
                };

                // The first <em> that starts with the translator marker names the translator;
                // it is removed so it does not show up in the article text.
                var trans = doc.DocumentNode.SelectNodes("//em");
                if (trans != null)
                {
                    foreach (var n in trans)
                    {
                        var txt = n.InnerText;
                        // Ordinal comparison: the marker is a fixed literal, not linguistic text.
                        if (!txt.StartsWith(TranslatorPrefix, System.StringComparison.Ordinal))
                        {
                            continue;
                        }
                        res.Add("Übersetzer", txt.Replace(TranslatorPrefix, ""));
                        n.ParentNode.RemoveChild(n);
                        break;
                    }
                }

                foreach (var xpath in unwantedNodes)
                {
                    DeleteNode(ref doc, xpath);
                }

                var main = doc.DocumentNode.SelectSingleNode("/html/body/main");

                // A page without /html/body/main must not crash the run (earlier files were
                // already deleted and nothing has been written yet); store an empty text instead.
                res.Add("Text", main?.InnerText ?? string.Empty);
                scrap.Add(res);
            }

            Serializer.Serialize(scrap.ToArray(), "output.sdd", true);
        }