Beispiel #1
0
    /// <summary>
    /// Tokenizes and tags <paramref name="text"/> with the shared tagger held in <c>Global.mPosTagger</c>
    /// and returns the tagged corpus as a string.
    /// </summary>
    /// <param name="text">Raw text to tokenize and tag.</param>
    /// <param name="xmlOutput">
    /// <c>true</c> to serialize the corpus with the "XML-MI" format, <c>false</c> to use "TBL".
    /// </param>
    /// <returns>The serialized, tagged corpus.</returns>
    public string Tag(string text, bool xmlOutput)
    {
        // Poll every 100 ms until Global.mReady is set; nothing below runs before that.
        while (true)
        {
            if (Global.mReady)
            {
                break;
            }
            Thread.Sleep(100);
        }

        Corpus taggedCorpus = new Corpus();
        taggedCorpus.LoadFromTextSsjTokenizer(text);

        // The tagger reports three counters through out-parameters; they are not used here.
        int correctCount, correctLowercaseCount, wordCount;
        Global.mPosTagger.Tag(taggedCorpus, out correctCount, out correctLowercaseCount, out wordCount, /*xmlMode=*/ false);

        string outputFormat = xmlOutput ? "XML-MI" : "TBL";
        return taggedCorpus.ToString(outputFormat);
    }
Beispiel #2
0
        /// <summary>
        /// Runs the whole analysis: tags every text found under DATA_FOLDER, initializes the five
        /// vector models, and writes index.html, one HTML page per text, one compare page per
        /// author and a TSV summary (OUTPUT_FILE).
        /// </summary>
        /// <param name="args">Command-line arguments; not read.</param>
        static void Main(string[] args)
        {
            StringBuilder featuresTsv = new StringBuilder();
            StringBuilder compareTsv  = new StringBuilder();

            featuresTsv.AppendLine("Avtor,DRB,BI,HS,HL,DRL,BI-L,HS-L,HL-L,B/P,Zn./B,Zl./B,DKB,ARI,Flesch,Fog".Replace(",", "\t"));
            // setup logger
            Logger logger = Logger.GetRootLogger();

            logger.LocalLevel      = Logger.Level.Debug;
            logger.LocalOutputType = Logger.OutputType.Custom;
            Logger.CustomOutput    = delegate(string loggerName, Logger.Level level, string funcName, Exception e, string message, object[] msgArgs) {
                // Only treat the message as a composite format string when there are arguments.
                // Messages built by concatenation (titles, author names) may contain '{' or '}',
                // which would otherwise raise FormatException.
                if (msgArgs == null || msgArgs.Length == 0)
                {
                    Console.WriteLine(message);
                }
                else
                {
                    Console.WriteLine(message, msgArgs);
                }
            };
            // load POS tagger models
            logger.Info("Main", "Nalagam modele za oblikoslovno analizo ...");
            PartOfSpeechTagger posTagger = new PartOfSpeechTagger(POS_TAGGER_MODEL, LEMMATIZER_MODEL);

            // load and preprocess texts
            logger.Info("Main", "Nalagam podatke ...");
            Dictionary <string, Author> authors = LoadAuthors(posTagger, logger);

            // initialize one model per feature-vector kind
            FunctionWordsModel fuw = new FunctionWordsModel();
            fuw.Initialize(authors.Values);
            FrequentWordsModel frw = new FrequentWordsModel();
            frw.Initialize(authors.Values);
            FrequentLemmasModel frl = new FrequentLemmasModel();
            frl.Initialize(authors.Values);
            CharNGramsModel cng = new CharNGramsModel();
            cng.Initialize(authors.Values);
            PosTagsModel pos = new PosTagsModel();
            pos.Initialize(authors.Values);

            // modelBs[i] is the model that belongs to VECTOR_NAMES[i]
            ModelBase[] modelBs = new ModelBase[] { fuw, frw, frl, cng, pos };
            foreach (Author author in authors.Values)
            {
                author.ComputeFeatures();
                for (int i = 0; i < VECTOR_NAMES.Length; i++)
                {
                    string vecName = VECTOR_NAMES[i];
                    author.mPredictions.Add(vecName, modelBs[i].mModels[author.mName].Predict(author.mFeatureVectors[vecName]));
                }
            }

            ArrayList <Author> authorsArray = new ArrayList <Author>(authors.Values);

            foreach (Author author in authorsArray)
            {
                // Iterate over a snapshot of the keys.
                // NOTE(review): presumably GetFeatureRanking adds entries (the "p_" keys read later) to mFeatures - confirm.
                foreach (string feature in new ArrayList <string>(author.mFeatures.Keys))
                {
                    Features.GetFeatureRanking(author, authorsArray, feature);
                }
            }

            // write results
            logger.Info("Main", "Pišem rezultate ...");
            foreach (string resName in "bootstrap.min.css,bootstrap.min.js,code.js,jquery.js,jquery.tablesorter.min.js,sort_asc.png,sort_both.png,sort_desc.png,styles.css".Split(','))
            {
                CopyToOutput(resName, OUTPUT_PATH);
            }
            using (StreamWriter wIdx = new StreamWriter(OUTPUT_PATH + "\\index.html", /*append=*/ false, Encoding.UTF8))
            {
                WriteHeader(wIdx);
                wIdx.WriteLine("<h1>Rezultati analize</h1>");
                // The running number links each author to compare_<number>.html; the compare pages
                // below are numbered by enumerating the same, unmodified dictionary.
                int authorNum = 0;
                foreach (KeyValuePair <string, Author> item in authors)
                {
                    WriteAuthorSection(wIdx, item.Key, item.Value, ++authorNum, featuresTsv, fuw, frw, frl, cng, pos);
                }
                WriteFooter(wIdx);
            }
            // write author-compare pages
            int n = 0;

            foreach (Author author in authors.Values)
            {
                WriteComparePage(authors, author, ++n, compareTsv);
            }
            using (StreamWriter wTsv = new StreamWriter(OUTPUT_FILE, /*append=*/ false, Encoding.UTF8))
            {
                wTsv.Write(featuresTsv.ToString());
                wTsv.Write(compareTsv.ToString());
            }
        }

        // Feature names of the "Obseg besedišča" tables, in column order.
        private static readonly string[] VOCABULARY_FEATURES = "ttr,brunet,honore,hl,ttrLemma,brunetLemma,honoreLemma,hlLemma".Split(',');

        // Feature names of the "Berljivost" tables, in column order.
        private static readonly string[] READABILITY_FEATURES = "rWords,rChars,rSyllables,rComplex,ari,flesch,fog".Split(',');

        // Names of the feature vectors, in the order used for the model array and the compare tables.
        private static readonly string[] VECTOR_NAMES = "fuw,frw,frl,cng,pos".Split(',');

        // Reads every author folder under DATA_FOLDER, tags each *.txt file and groups the texts by author.
        private static Dictionary <string, Author> LoadAuthors(PartOfSpeechTagger posTagger, Logger logger)
        {
            Dictionary <string, Author> authors = new Dictionary <string, Author>();

            foreach (DirectoryInfo authorDir in new DirectoryInfo(DATA_FOLDER).GetDirectories())
            {
                string authorName     = authorDir.Name;
                // The folder named UNKNOWN_AUTHOR holds the texts whose author is to be determined.
                bool   isTaggedAuthor = authorName.Equals(UNKNOWN_AUTHOR, StringComparison.OrdinalIgnoreCase);
                logger.Info("Main", "Obravnavam avtorja \"" + authorName + "\" ...");
                foreach (FileInfo authorFile in authorDir.GetFiles("*.txt"))
                {
                    string txt = File.ReadAllText(authorFile.FullName, Encoding.GetEncoding(DATA_ENCODING));
                    // The first line of the file is the title. Match.Result throws on a failed match
                    // (a file without any line break), so fall back to the whole text in that case.
                    Match  m     = Regex.Match(txt, "^(.*?)(\r)?\n");
                    string title = (m.Success ? m.Groups[1].Value : txt).Trim();
                    logger.Info("Main", "Obravnavam članek \"" + title + "\" ...");
                    // preprocess text
                    Corpus corpus = new Corpus();
                    corpus.LoadFromTextSsjTokenizer(txt);
                    posTagger.Tag(corpus);
                    Text text = new Text(corpus, title, authorName);
                    text.mIsTagged = isTaggedAuthor;
                    // authors are keyed by the name stored on the text
                    Author author;
                    if (!authors.TryGetValue(text.mAuthor, out author))
                    {
                        author           = new Author(text.mAuthor);
                        author.mIsTagged = isTaggedAuthor;
                        authors.Add(text.mAuthor, author);
                    }
                    author.mTexts.Add(text);
                }
            }
            return authors;
        }

        // Writes one author's section of index.html (plus the page of each of the author's texts)
        // and appends the author's row of feature averages to featuresTsv.
        private static void WriteAuthorSection(StreamWriter wIdx, string authorKey, Author author, int authorNum, StringBuilder featuresTsv,
            FunctionWordsModel fuw, FrequentWordsModel frw, FrequentLemmasModel frl, CharNGramsModel cng, PosTagsModel pos)
        {
            wIdx.WriteLine("<h2>Avtor: {0}</h2>", HttpUtility.HtmlEncode(authorKey));
            featuresTsv.Append(authorKey);
            string compareLink = "compare_" + authorNum + ".html";
            if (author.mIsTagged)
            {
                wIdx.WriteLine("<div class='alert alert-info'><strong>Neznani avtor.</strong> <a href='{0}'>Primerjaj z ostalimi avtorji »</a></div>", compareLink);
            }
            else
            {
                wIdx.WriteLine("<p><a href='{0}'>Primerjaj z ostalimi avtorji »</a></p>", compareLink);
            }
            wIdx.WriteLine("<h3>Besedila</h3>");
            wIdx.WriteLine("<ul>");
            foreach (Text text in author.mTexts)
            {
                wIdx.WriteLine("<li><a href='{1}'>{0} »</a></li>", HttpUtility.HtmlEncode(text.mName), text.mHtmlFileName);
                WriteTextPage(text, fuw, frw, frl, cng, pos);
            }
            wIdx.WriteLine("</ul>");
            wIdx.WriteLine("<h3>Značilke</h3>");
            wIdx.WriteLine("<h4>Obseg besedišča</h4>");
            WriteTableStart(wIdx, "table table-bordered table-striped", "<tr><th>Značilka</th><th>Vrednost</th><th>Std. odklon</th></tr>");
            foreach (string featureNm in VOCABULARY_FEATURES)
            {
                WriteFeature(wIdx, featureNm, author.GetAvg(featureNm), author.GetStdDev(featureNm));
                featuresTsv.Append("\t" + author.GetAvg(featureNm));
            }
            WriteTableEnd(wIdx);
            wIdx.WriteLine("<h4>Berljivost</h4>");
            WriteTableStart(wIdx, "table table-bordered table-striped", "<tr><th>Značilka</th><th>Vrednost</th><th>Std. odklon</th></tr>");
            foreach (string featureNm in READABILITY_FEATURES)
            {
                WriteFeature(wIdx, featureNm, author.GetAvg(featureNm), author.GetStdDev(featureNm));
                featuresTsv.Append("\t" + author.GetAvg(featureNm));
            }
            featuresTsv.AppendLine();
            WriteTableEnd(wIdx);
            if (!author.mIsTagged)
            {
                wIdx.WriteLine("<h4>Razlikovalna moč značilk</h4>");
                WriteTableStart(wIdx, "tablesorter table table-bordered table-striped", "<tr><th>Značilka</th><th>Utež</th></tr>");
                // entries whose key starts with "p_" are listed without that prefix
                foreach (string pKey in author.mFeatures.Keys.Where(x => x.StartsWith("p_")))
                {
                    WriteFeature(wIdx, pKey.Substring(2), author.GetAvg(pKey), author.GetStdDev(pKey), /*sameCell=*/ true);
                }
                WriteTableEnd(wIdx);
            }

            int j = 0;
            WriteCollapsibleStart(wIdx, "h4", "Funkcijske besede", "fuw_" + authorNum, "Seznam funkcijskih besed", "Beseda");
            foreach (Pair <string, double> word in author.GetTopVectorItems("fuw", TOP_ITEMS_COUNT, fuw.mBowSpace))
            {
                WriteRankedRow(wIdx, ++j, HttpUtility.HtmlEncode(word.First), word.Second);
            }
            WriteCollapsibleEnd(wIdx);

            j = 0;
            WriteCollapsibleStart(wIdx, "h4", "Pogoste besede", "frw_" + authorNum, "Seznam pogostih besed", "Beseda");
            foreach (Pair <string, double> word in author.GetTopVectorItems("frw", TOP_ITEMS_COUNT, frw.mBowSpace))
            {
                WriteRankedRow(wIdx, ++j, HttpUtility.HtmlEncode(word.First), word.Second);
            }
            WriteCollapsibleEnd(wIdx);

            j = 0;
            // NOTE(review): the per-text page labels this column "Lema"; the index has always used "Beseda" - confirm which is intended.
            WriteCollapsibleStart(wIdx, "h4", "Pogoste leme", "frl_" + authorNum, "Seznam pogostih lem", "Beseda");
            foreach (Pair <string, double> word in author.GetTopVectorItems("frl", TOP_ITEMS_COUNT, frl.mBowSpace))
            {
                WriteRankedRow(wIdx, ++j, HttpUtility.HtmlEncode(word.First), word.Second);
            }
            WriteCollapsibleEnd(wIdx);

            j = 0;
            WriteCollapsibleStart(wIdx, "h4", "Znakovna zaporedja", "cng_" + authorNum, "Seznam znakovnih zaporedij", "Zaporedje");
            foreach (Pair <string, double> word in author.GetTopVectorItems("cng", TOP_ITEMS_COUNT, cng.mBowSpace))
            {
                WriteRankedRow(wIdx, ++j, HttpUtility.HtmlEncode(word.First), word.Second);
            }
            // This section used to emit "</table>" twice, producing an unbalanced closing tag.
            WriteCollapsibleEnd(wIdx);

            j = 0;
            WriteCollapsibleStart(wIdx, "h4", "Oblikoslovne oznake", "pos_" + authorNum, "Seznam oblikoslovnih oznak", "Zaporedje");
            foreach (Pair <string, double> word in author.GetTopVectorItems("pos", TOP_ITEMS_COUNT, pos.mBowSpace))
            {
                WriteRankedRow(wIdx, ++j, HttpUtility.HtmlEncode(word.First), word.Second);
            }
            WriteCollapsibleEnd(wIdx);
        }

        // Writes the HTML page of a single text: the text itself, its feature tables and the top
        // keywords of each of its feature vectors.
        private static void WriteTextPage(Text text, FunctionWordsModel fuw, FrequentWordsModel frw, FrequentLemmasModel frl, CharNGramsModel cng, PosTagsModel pos)
        {
            using (StreamWriter wDoc = new StreamWriter(OUTPUT_PATH + "\\" + text.mHtmlFileName, /*append=*/ false, Encoding.UTF8))
            {
                WriteHeader(wDoc);
                wDoc.WriteLine("<div class='back'><a href='index.html'>« Seznam avtorjev</a></div>");
                wDoc.WriteLine("<h1>Besedilo</h1>");
                wDoc.WriteLine("<h2>{0}</h2>", HttpUtility.HtmlEncode(text.mName));
                wDoc.WriteLine(text.GetHtml());
                wDoc.WriteLine("<h1>Značilke</h1>");
                wDoc.WriteLine("<h2>Obseg besedišča</h2>");
                WriteTableStart(wDoc, "table table-bordered table-striped", "<tr><th>Značilka</th><th>Vrednost</th></tr>");
                foreach (string featureNm in VOCABULARY_FEATURES)
                {
                    WriteFeature(wDoc, featureNm, text.mFeatures[featureNm]);
                }
                WriteTableEnd(wDoc);
                wDoc.WriteLine("<h2>Berljivost</h2>");
                WriteTableStart(wDoc, "table table-bordered table-striped", "<tr><th>Značilka</th><th>Vrednost</th></tr>");
                foreach (string featureNm in READABILITY_FEATURES)
                {
                    WriteFeature(wDoc, featureNm, text.mFeatures[featureNm]);
                }
                WriteTableEnd(wDoc);

                int i = 0;
                WriteCollapsibleStart(wDoc, "h2", "Funkcijske besede", "fuw", "Seznam funkcijskih besed", "Beseda");
                foreach (KeyDat <double, Word> wordInfo in fuw.mBowSpace.GetKeywords(text.mFeatureVectors["fuw"]).Take(TOP_ITEMS_COUNT))
                {
                    WriteRankedRow(wDoc, ++i, HttpUtility.HtmlEncode(wordInfo.Dat.Stem), wordInfo.Key);
                }
                WriteCollapsibleEnd(wDoc);

                i = 0;
                WriteCollapsibleStart(wDoc, "h2", "Pogoste besede", "frw", "Seznam pogostih besed", "Beseda");
                foreach (KeyDat <double, Word> wordInfo in frw.mBowSpace.GetKeywords(text.mFeatureVectors["frw"]).Take(TOP_ITEMS_COUNT))
                {
                    WriteRankedRow(wDoc, ++i, HttpUtility.HtmlEncode(wordInfo.Dat.Stem), wordInfo.Key);
                }
                WriteCollapsibleEnd(wDoc);

                i = 0;
                WriteCollapsibleStart(wDoc, "h2", "Pogoste leme", "frl", "Seznam pogostih lem", "Lema");
                foreach (KeyDat <double, Word> wordInfo in frl.mBowSpace.GetKeywords(text.mFeatureVectors["frl"]).Take(TOP_ITEMS_COUNT))
                {
                    WriteRankedRow(wDoc, ++i, HttpUtility.HtmlEncode(wordInfo.Dat.Stem), wordInfo.Key);
                }
                WriteCollapsibleEnd(wDoc);

                i = 0;
                WriteCollapsibleStart(wDoc, "h2", "Znakovna zaporedja", "cng", "Seznam znakovnih zaporedij", "Zaporedje");
                foreach (KeyDat <double, Word> wordInfo in cng.mBowSpace.GetKeywords(text.mFeatureVectors["cng"]).Take(TOP_ITEMS_COUNT))
                {
                    WriteRankedRow(wDoc, ++i, HttpUtility.HtmlEncode(wordInfo.Dat.Stem), wordInfo.Key);
                }
                WriteCollapsibleEnd(wDoc);

                i = 0;
                WriteCollapsibleStart(wDoc, "h2", "Oblikoslovne oznake", "pos", "Seznam oblikoslovnih oznak", "Zaporedje");
                foreach (KeyDat <double, Word> wordInfo in pos.mBowSpace.GetKeywords(text.mFeatureVectors["pos"]).Take(TOP_ITEMS_COUNT))
                {
                    WriteRankedRow(wDoc, ++i, HttpUtility.HtmlEncode(wordInfo.Dat.Stem), wordInfo.Key);
                }
                WriteCollapsibleEnd(wDoc);
                WriteFooter(wDoc);
            }
        }

        // Writes compare_<pageNum>.html for one author and appends the same comparison to compareTsv.
        private static void WriteComparePage(Dictionary <string, Author> authors, Author author, int pageNum, StringBuilder compareTsv)
        {
            string authorCompareFileName = OUTPUT_PATH + "\\compare_" + pageNum + ".html";
            using (StreamWriter wAuthorCmp = new StreamWriter(authorCompareFileName, /*append=*/ false, Encoding.UTF8))
            {
                WriteHeader(wAuthorCmp);
                wAuthorCmp.WriteLine("<div class='back'><a href='index.html'>« Seznam avtorjev</a></div>");
                wAuthorCmp.WriteLine("<h1>Primerjava</h1>");
                wAuthorCmp.WriteLine("<h2>Avtor: {0}</h2>", HttpUtility.HtmlEncode(author.mName));
                wAuthorCmp.WriteLine("<h3>Obseg besedišča</h3>");
                WriteTableStart(wAuthorCmp, "tablesorter table table-bordered table-striped", "<tr><th>Avtor</th><th>DRB</th><th>BI</th><th>HS</th><th>HL</th><th>DRL</th><th>BI-L</th><th>HS-L</th><th>HL-L</th></tr>");
                WriteAuthorCompareTable(wAuthorCmp, authors.Values, author, VOCABULARY_FEATURES, /*isVec=*/ false);
                WriteTableEnd(wAuthorCmp);
                wAuthorCmp.WriteLine("<h3>Berljivost</h3>");
                WriteTableStart(wAuthorCmp, "tablesorter table table-bordered table-striped", "<tr><th>Avtor</th><th>B/P</th><th>Zn./B</th><th>Zl./B</th><th>DKB</th><th>ARI</th><th>Flesch</th><th>Fog</th></tr>");
                WriteAuthorCompareTable(wAuthorCmp, authors.Values, author, READABILITY_FEATURES, /*isVec=*/ false);
                WriteTableEnd(wAuthorCmp);
                wAuthorCmp.WriteLine("<h3>Vektorji značilk</h3>");
                WriteTableStart(wAuthorCmp, "tablesorter table table-bordered table-striped", "<tr><th>Avtor</th><th>FB</th><th>PB</th><th>PL</th><th>ZZ</th><th>Ozn.</th></tr>");
                WriteAuthorCompareTable(wAuthorCmp, authors.Values, author, VECTOR_NAMES, /*isVec=*/ true);
                WriteTableEnd(wAuthorCmp);
                WriteFooter(wAuthorCmp);
                // write to TSV: all three feature groups in one row per author
                compareTsv.AppendLine();
                compareTsv.AppendLine(author.mName);
                compareTsv.AppendLine("Avtor,DRB,BI,HS,HL,DRL,BI-L,HS-L,HL-L,B/P,Zn./B,Zl./B,DKB,ARI,Flesch,Fog,FB,PB,PL,ZZ,Ozn.".Replace(",", "\t"));
                WriteAuthorCompareTable(compareTsv, authors.Values, author, VOCABULARY_FEATURES.Concat(READABILITY_FEATURES).Concat(VECTOR_NAMES).ToArray());
            }
        }

        // Opens a table with the given CSS class, writes its header row and leaves <tbody> open.
        private static void WriteTableStart(StreamWriter w, string cssClass, string headerRow)
        {
            w.WriteLine("<table class='" + cssClass + "'>");
            w.WriteLine("<thead>");
            w.WriteLine(headerRow);
            w.WriteLine("</thead>");
            w.WriteLine("<tbody>");
        }

        // Closes the <tbody> and <table> opened by WriteTableStart.
        private static void WriteTableEnd(StreamWriter w)
        {
            w.WriteLine("</tbody>");
            w.WriteLine("</table>");
        }

        // Writes a heading, the toggle link and the opening of a collapsible <div> holding a ranked
        // three-column table (rank, item, weight); the table body is left open for the rows.
        private static void WriteCollapsibleStart(StreamWriter w, string headingTag, string heading, string id, string linkText, string itemColumn)
        {
            w.WriteLine("<" + headingTag + ">" + heading + "</" + headingTag + ">");
            w.WriteLine("<p><a href='javascript:void(0)' data-toggle='collapse' data-target='#" + id + "'>" + linkText + "</a></p>");
            w.WriteLine("<div id='" + id + "' class='collapse'>");
            WriteTableStart(w, "table table-bordered table-striped", "<tr><th>Zap. št.</th><th>" + itemColumn + "</th><th>Utež</th></tr>");
        }

        // Closes the table and the <div> opened by WriteCollapsibleStart.
        private static void WriteCollapsibleEnd(StreamWriter w)
        {
            WriteTableEnd(w);
            w.WriteLine("</div>");
        }

        // Writes one row of a ranked table; encodedLabel must already be HTML-encoded.
        private static void WriteRankedRow(StreamWriter w, int rank, string encodedLabel, double weight)
        {
            w.WriteLine("<tr><td>{0}.</td><td>{1}</td><td>{2:0.00}</td></tr>", rank, encodedLabel, weight);
        }
Beispiel #3
0
        /// <summary>
        /// Tags and parses the blog XML files in Config.DataFolder in batches of up to
        /// Config.BatchSize files and writes one output XML file per input file.
        /// </summary>
        /// <remarks>
        /// Files that already have an output file, or a ".locked" marker next to it, are skipped.
        /// When the parser produces no output for a batch, every file of that batch is passed to
        /// LockFile and processing continues with the next batch.
        /// </remarks>
        /// <param name="args">Optional: args[0] overrides the file search pattern (default "*.xml").</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Nalagam meta-podatke o blogih...");
            LoadBlogMetaData();
            Console.WriteLine("Nalagam oznacevalnik...");
            PartOfSpeechTagger posTagger = new PartOfSpeechTagger(Config.PosTaggerModel, Config.LemmatizerModel);
            string             pattern   = "*.xml";

            if (args.Length > 0)
            {
                pattern = args[0];
            }
            Queue <string> fileNames = new Queue <string>(Directory.GetFiles(Config.DataFolder, pattern));

            // the same writer settings serve the parser input and every output file
            XmlWriterSettings xmlSettings = new XmlWriterSettings();
            xmlSettings.Encoding = Encoding.UTF8;
            xmlSettings.Indent   = true;

            while (fileNames.Count > 0)
            {
                // n counts only the files that actually made it into the batch
                int         n       = 0;
                XmlDocument fullDoc = null;
                while (fileNames.Count > 0 && n < Config.BatchSize)
                {
                    string fileName = fileNames.Dequeue();
                    if (File.Exists(MakeOutputFileName(fileName)))
                    {
                        Console.WriteLine("Ze obdelano: {0}.", fileName);
                        continue;
                    }
                    if (File.Exists(MakeOutputFileName(fileName) + ".locked"))
                    {
                        Console.WriteLine("Zaklenjeno: {0}.", fileName);
                        continue;
                    }
                    // load text
                    Console.WriteLine("Datoteka: {0}...", fileName);
                    XmlDocument tmpDoc = LoadSourceXml(fileName);
                    // a document without a <besedilo> element is handled like one with empty text
                    XmlNode textNode = tmpDoc.SelectSingleNode("//besedilo");
                    string  text     = textNode == null ? "" : textNode.InnerText;
                    if (text.Trim() == "") // *** empty documents are ignored
                    {
                        Console.WriteLine("*** Datoteka ne vsebuje besedila.");
                        continue;
                    }
                    Corpus corpus = new Corpus();
                    corpus.LoadFromTextSsjTokenizer(text);
                    // tag text
                    Console.WriteLine("Oznacujem besedilo...");
                    posTagger.Tag(corpus);
                    XmlDocument doc = new XmlDocument();
                    // the TEI default namespace is stripped so the un-prefixed XPath queries below match
                    doc.LoadXml(corpus.ToString("XML-MI").Replace("xmlns=\"http://www.tei-c.org/ns/1.0\"", ""));
                    // remember which input file this <text> came from; read back after parsing
                    ((XmlElement)doc.SelectSingleNode("//text")).SetAttribute("fileName", fileName);
                    // append text to fullDoc
                    if (fullDoc == null)
                    {
                        fullDoc = doc;
                    }
                    else
                    {
                        XmlDocumentFragment xmlFrag = fullDoc.CreateDocumentFragment();
                        xmlFrag.InnerXml = doc.SelectSingleNode("//text").OuterXml;
                        fullDoc.DocumentElement.AppendChild(xmlFrag);
                    }
                    n++;
                }
                // nothing to do?
                if (fullDoc == null)
                {
                    continue;
                }
                // save tagged text for parsing
                Console.WriteLine("Pripravljam datoteke za razclenjevanje...");
                Guid   tmpId          = Guid.NewGuid();
                string tmpFileNameIn  = new FileInfo(Config.TmpFolder + "\\" + tmpId.ToString("N") + ".tmp").FullName;
                string tmpFileNameOut = new FileInfo(Config.TmpFolder + "\\" + tmpId.ToString("N") + ".out.tmp").FullName;
                using (XmlWriter w = XmlWriter.Create(tmpFileNameIn, xmlSettings))
                {
                    fullDoc.Save(w);
                }
                // parse text
                Console.WriteLine("Zaganjam razclenjevalnik...");
                Parser.Parse(tmpFileNameIn, tmpFileNameOut);
                // load results
                if (!File.Exists(tmpFileNameOut))
                {
                    // lock files and continue
                    Console.WriteLine("*** Prislo je do napake pri razclenjevanju. Nadaljujem z obdelavo.");
                    foreach (XmlElement failedText in fullDoc.SelectNodes("//text"))
                    {
                        LockFile(failedText.Attributes["fileName"].Value);
                    }
                    continue;
                }
                fullDoc = new XmlDocument();
                fullDoc.Load(tmpFileNameOut);
                // create output files
                Console.WriteLine("Pisem izhodne datoteke...");
                foreach (XmlNode txtNode in fullDoc.SelectNodes("//text"))
                {
                    string fileName = txtNode.Attributes["fileName"].Value;
                    ((XmlElement)txtNode).RemoveAttribute("fileName");
                    Console.WriteLine("Datoteka: {0}...", fileName);
                    XmlDocument tmpDoc = LoadSourceXml(fileName);
                    // insert input XML into TEI-XML
                    XmlDocument doc = new XmlDocument();
                    doc.LoadXml("<TEI>" + txtNode.OuterXml + "</TEI>");
                    XmlDocumentFragment docPart = doc.CreateDocumentFragment();
                    docPart.InnerXml = tmpDoc.OuterXml;
                    doc.DocumentElement.PrependChild(docPart);
                    // insert blog meta-data; a missing <header>/<blog> element is reported like an unknown blog
                    XmlNode      blogNode = doc.SelectSingleNode("//header/blog");
                    string       key      = blogNode == null ? "" : blogNode.InnerText;
                    BlogMetaData metaData;
                    if (!mBlogMetaData.TryGetValue(key, out metaData))
                    {
                        // no output file is written, so the file is picked up again on the next run
                        Console.WriteLine("*** Ne najdem podatkov o blogu \"{0}\".", key);
                        continue;
                    }
                    Console.WriteLine("Vstavljam meta-podatke o blogu...");
                    XmlNode node = doc.SelectSingleNode("//header");
                    node.AppendChild(doc.CreateElement("blogSpletniNaslov")).InnerText = metaData.mBlogUrl;
                    node.AppendChild(doc.CreateElement("blogNaslov")).InnerText        = metaData.mBlogTitle;
                    node.AppendChild(doc.CreateElement("blogNaslovKratek")).InnerText  = metaData.mBlogTitleShort;
                    node.AppendChild(doc.CreateElement("avtorSpol")).InnerText         = metaData.mAuthorGender;
                    node.AppendChild(doc.CreateElement("avtorStarost")).InnerText      = metaData.mAuthorAge;
                    node.AppendChild(doc.CreateElement("avtorRegija")).InnerText       = metaData.mAuthorLocation;
                    node.AppendChild(doc.CreateElement("avtorIzobrazba")).InnerText    = metaData.mAuthorEducation;
                    // write results
                    Console.WriteLine("Zapisujem rezultate...");
                    using (XmlWriter w = XmlWriter.Create(MakeOutputFileName(fileName), xmlSettings))
                    {
                        doc.Save(w);
                    }
                }
            }
            // purge temp folder
            foreach (string tmpFile in Directory.GetFiles(Config.TmpFolder, "*.tmp"))
            {
                File.Delete(tmpFile);
            }
            // all done
            Console.WriteLine("Koncano.");
        }

        // Reads an input file, removes the literal sequences "// ]]>" and "//--><!]]>" from its
        // content and parses the result as XML.
        // NOTE(review): these look like leftovers of embedded script blocks - confirm.
        private static XmlDocument LoadSourceXml(string fileName)
        {
            string xml = File.ReadAllText(fileName);
            xml = xml.Replace("// ]]>", "").Replace("//--><!]]>", "");
            XmlDocument doc = new XmlDocument();
            doc.LoadXml(xml);
            return doc;
        }