/// <summary>
/// Entry point of the authorship-analysis report generator. Loads and POS-tags every text found
/// under DATA_FOLDER (one sub-folder per author), initializes the five feature-vector models
/// (function words, frequent words, frequent lemmas, character n-grams, POS tags), and writes
/// the HTML report (index.html, one page per text, one compare page per author) to OUTPUT_PATH
/// plus a TSV summary to OUTPUT_FILE.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    StringBuilder featuresTsv = new StringBuilder();
    StringBuilder compareTsv = new StringBuilder();
    featuresTsv.AppendLine("Avtor,DRB,BI,HS,HL,DRL,BI-L,HS-L,HL-L,B/P,Zn./B,Zl./B,DKB,ARI,Flesch,Fog".Replace(",", "\t"));

    // setup logger
    Logger logger = Logger.GetRootLogger();
    logger.LocalLevel = Logger.Level.Debug;
    logger.LocalOutputType = Logger.OutputType.Custom;
    Logger.CustomOutput = delegate(string loggerName, Logger.Level level, string funcName, Exception e, string message, object[] msgArgs)
    {
        // The messages logged below are built by concatenating author names and titles, so they
        // may contain '{' or '}'. Only treat the message as a composite format string when there
        // actually are arguments to format; otherwise Console.WriteLine would throw FormatException.
        if (msgArgs == null || msgArgs.Length == 0)
        {
            Console.WriteLine(message);
        }
        else
        {
            Console.WriteLine(message, msgArgs);
        }
    };

    // load POS tagger models
    logger.Info("Main", "Nalagam modele za oblikoslovno analizo ...");
    PartOfSpeechTagger posTagger = new PartOfSpeechTagger(POS_TAGGER_MODEL, LEMMATIZER_MODEL);

    // load and preprocess texts
    logger.Info("Main", "Nalagam podatke ...");
    Dictionary<string, Author> authors = LoadAuthors(logger, posTagger);

    // initialize the feature-vector models
    FunctionWordsModel fuw = new FunctionWordsModel();
    fuw.Initialize(authors.Values);
    FrequentWordsModel frw = new FrequentWordsModel();
    frw.Initialize(authors.Values);
    FrequentLemmasModel frl = new FrequentLemmasModel();
    frl.Initialize(authors.Values);
    CharNGramsModel cng = new CharNGramsModel();
    cng.Initialize(authors.Values);
    PosTagsModel pos = new PosTagsModel();
    pos.Initialize(authors.Values);

    // compute features and per-model predictions for every author
    string[] vecNames = new string[] { "fuw", "frw", "frl", "cng", "pos" };
    ModelBase[] modelBs = new ModelBase[] { fuw, frw, frl, cng, pos };
    foreach (Author author in authors.Values)
    {
        author.ComputeFeatures();
        for (int i = 0; i < vecNames.Length; i++)
        {
            author.mPredictions.Add(vecNames[i], modelBs[i].mModels[author.mName].Predict(author.mFeatureVectors[vecNames[i]]));
        }
    }

    // rank features; the key collection is copied before iterating, as in the original code
    ArrayList<Author> authorsArray = new ArrayList<Author>(authors.Values);
    foreach (Author author in authorsArray)
    {
        foreach (string feature in new ArrayList<string>(author.mFeatures.Keys))
        {
            Features.GetFeatureRanking(author, authorsArray, feature);
        }
    }

    // write results
    logger.Info("Main", "Pišem rezultate ...");
    foreach (string resName in "bootstrap.min.css,bootstrap.min.js,code.js,jquery.js,jquery.tablesorter.min.js,sort_asc.png,sort_both.png,sort_desc.png,styles.css".Split(','))
    {
        CopyToOutput(resName, OUTPUT_PATH);
    }
    using (StreamWriter wIdx = new StreamWriter(Path.Combine(OUTPUT_PATH, "index.html"), /*append=*/ false, Encoding.UTF8))
    {
        WriteHeader(wIdx);
        wIdx.WriteLine("<h1>Rezultati analize</h1>");
        int authorNum = 0;
        foreach (KeyValuePair<string, Author> item in authors)
        {
            authorNum++;
            WriteAuthorIndexSection(wIdx, authorNum, item.Key, item.Value, featuresTsv, fuw, frw, frl, cng, pos);
        }
        WriteFooter(wIdx);
    }

    // write author-compare pages; numbering follows the same dictionary enumeration order as the
    // index page above, so compare_N.html matches the N-th author section in index.html
    int n = 0;
    foreach (Author author in authors.Values)
    {
        WriteComparePage(++n, author, authors, compareTsv);
    }

    using (StreamWriter wTsv = new StreamWriter(OUTPUT_FILE, /*append=*/ false, Encoding.UTF8))
    {
        wTsv.Write(featuresTsv.ToString());
        wTsv.Write(compareTsv.ToString());
    }
}

// Comma-separated feature keys shared by the per-text pages, the index page and the compare pages.
private const string VOCABULARY_FEATURE_KEYS = "ttr,brunet,honore,hl,ttrLemma,brunetLemma,honoreLemma,hlLemma";
private const string READABILITY_FEATURE_KEYS = "rWords,rChars,rSyllables,rComplex,ari,flesch,fog";
private const string VECTOR_FEATURE_KEYS = "fuw,frw,frl,cng,pos";

// CSS classes of the generated tables.
private const string PLAIN_TABLE_CLASS = "table table-bordered table-striped";
private const string SORTABLE_TABLE_CLASS = "tablesorter table table-bordered table-striped";

/// <summary>
/// Reads every *.txt file in each sub-folder of DATA_FOLDER, tags it and groups the resulting
/// texts by author. The first line of a file is used as the text title.
/// </summary>
/// <param name="logger">Logger used for progress messages.</param>
/// <param name="posTagger">Tagger applied to every loaded corpus.</param>
/// <returns>Authors keyed by the author name stored on their texts.</returns>
private static Dictionary<string, Author> LoadAuthors(Logger logger, PartOfSpeechTagger posTagger)
{
    Dictionary<string, Author> authors = new Dictionary<string, Author>();
    foreach (DirectoryInfo authorDir in new DirectoryInfo(DATA_FOLDER).GetDirectories())
    {
        string authorName = authorDir.Name;
        bool isTaggedAuthor = authorName.Equals(UNKNOWN_AUTHOR, StringComparison.OrdinalIgnoreCase);
        logger.Info("Main", "Obravnavam avtorja \"" + authorName + "\" ...");
        foreach (FileInfo authorFile in authorDir.GetFiles("*.txt"))
        {
            string txt = File.ReadAllText(authorFile.FullName, Encoding.GetEncoding(DATA_ENCODING));
            Match m = Regex.Match(txt, "^(.*?)(\r)?\n");
            // Match.Result throws on a failed match (file without any line break); in that case
            // the whole content is the first line.
            string title = m.Success ? m.Groups[1].Value.Trim() : txt.Trim();
            logger.Info("Main", "Obravnavam članek \"" + title + "\" ...");
            // preprocess text
            Corpus corpus = new Corpus();
            corpus.LoadFromTextSsjTokenizer(txt);
            posTagger.Tag(corpus);
            Text text = new Text(corpus, title, authorName);
            text.mIsTagged = isTaggedAuthor;
            Author author;
            if (!authors.TryGetValue(text.mAuthor, out author))
            {
                author = new Author(text.mAuthor);
                author.mIsTagged = isTaggedAuthor;
                authors.Add(text.mAuthor, author);
            }
            author.mTexts.Add(text);
        }
    }
    return authors;
}

/// <summary>Returns the file name (without folder) of the compare page of the N-th author.</summary>
private static string GetCompareFileName(int authorNum)
{
    return "compare_" + authorNum + ".html";
}

/// <summary>Writes the opening table tag, the header row and the opening tbody tag.</summary>
private static void WriteTableStart(StreamWriter w, string cssClass, string headerRowHtml)
{
    w.WriteLine("<table class='" + cssClass + "'>");
    w.WriteLine("<thead>");
    w.WriteLine(headerRowHtml);
    w.WriteLine("</thead>");
    w.WriteLine("<tbody>");
}

/// <summary>Closes a table opened with WriteTableStart.</summary>
private static void WriteTableEnd(StreamWriter w)
{
    w.WriteLine("</tbody>");
    w.WriteLine("</table>");
}

/// <summary>
/// Writes a section heading, the toggle link and the opening of a collapsible div holding a
/// three-column (rank, item, weight) table.
/// </summary>
private static void WriteCollapsibleTableStart(StreamWriter w, string headingTag, string heading, string linkText, string elementId, string itemHeader)
{
    w.WriteLine("<" + headingTag + ">" + heading + "</" + headingTag + ">");
    w.WriteLine("<p><a href='javascript:void(0)' data-toggle='collapse' data-target='#" + elementId + "'>" + linkText + "</a></p>");
    w.WriteLine("<div id='" + elementId + "' class='collapse'>");
    WriteTableStart(w, PLAIN_TABLE_CLASS, "<tr><th>Zap. št.</th><th>" + itemHeader + "</th><th>Utež</th></tr>");
}

/// <summary>Closes a section opened with WriteCollapsibleTableStart.</summary>
private static void WriteCollapsibleTableEnd(StreamWriter w)
{
    WriteTableEnd(w);
    w.WriteLine("</div>");
}

/// <summary>Writes one (rank, item, weight) table row; the item must already be HTML-encoded.</summary>
private static void WriteRankedRow(StreamWriter w, int rank, string encodedItem, double weight)
{
    w.WriteLine("<tr><td>{0}.</td><td>{1}</td><td>{2:0.00}</td></tr>", rank, encodedItem, weight);
}

/// <summary>
/// Writes the HTML page of a single text: the text itself, its vocabulary-richness and
/// readability features, and the top TOP_ITEMS_COUNT keywords of each feature vector.
/// </summary>
private static void WriteTextPage(Text text, FunctionWordsModel fuw, FrequentWordsModel frw, FrequentLemmasModel frl, CharNGramsModel cng, PosTagsModel pos)
{
    using (StreamWriter wDoc = new StreamWriter(Path.Combine(OUTPUT_PATH, text.mHtmlFileName), /*append=*/ false, Encoding.UTF8))
    {
        WriteHeader(wDoc);
        wDoc.WriteLine("<div class='back'><a href='index.html'>« Seznam avtorjev</a></div>");
        wDoc.WriteLine("<h1>Besedilo</h1>");
        wDoc.WriteLine("<h2>{0}</h2>", HttpUtility.HtmlEncode(text.mName));
        wDoc.WriteLine(text.GetHtml());
        wDoc.WriteLine("<h1>Značilke</h1>");

        wDoc.WriteLine("<h2>Obseg besedišča</h2>");
        WriteTableStart(wDoc, PLAIN_TABLE_CLASS, "<tr><th>Značilka</th><th>Vrednost</th></tr>");
        foreach (string featureNm in VOCABULARY_FEATURE_KEYS.Split(','))
        {
            WriteFeature(wDoc, featureNm, text.mFeatures[featureNm]);
        }
        WriteTableEnd(wDoc);

        wDoc.WriteLine("<h2>Berljivost</h2>");
        WriteTableStart(wDoc, PLAIN_TABLE_CLASS, "<tr><th>Značilka</th><th>Vrednost</th></tr>");
        foreach (string featureNm in READABILITY_FEATURE_KEYS.Split(','))
        {
            WriteFeature(wDoc, featureNm, text.mFeatures[featureNm]);
        }
        WriteTableEnd(wDoc);

        int rank = 0;
        WriteCollapsibleTableStart(wDoc, "h2", "Funkcijske besede", "Seznam funkcijskih besed", "fuw", "Beseda");
        foreach (KeyDat<double, Word> wordInfo in fuw.mBowSpace.GetKeywords(text.mFeatureVectors["fuw"]).Take(TOP_ITEMS_COUNT))
        {
            WriteRankedRow(wDoc, ++rank, HttpUtility.HtmlEncode(wordInfo.Dat.Stem), wordInfo.Key);
        }
        WriteCollapsibleTableEnd(wDoc);

        rank = 0;
        WriteCollapsibleTableStart(wDoc, "h2", "Pogoste besede", "Seznam pogostih besed", "frw", "Beseda");
        foreach (KeyDat<double, Word> wordInfo in frw.mBowSpace.GetKeywords(text.mFeatureVectors["frw"]).Take(TOP_ITEMS_COUNT))
        {
            WriteRankedRow(wDoc, ++rank, HttpUtility.HtmlEncode(wordInfo.Dat.Stem), wordInfo.Key);
        }
        WriteCollapsibleTableEnd(wDoc);

        rank = 0;
        WriteCollapsibleTableStart(wDoc, "h2", "Pogoste leme", "Seznam pogostih lem", "frl", "Lema");
        foreach (KeyDat<double, Word> wordInfo in frl.mBowSpace.GetKeywords(text.mFeatureVectors["frl"]).Take(TOP_ITEMS_COUNT))
        {
            WriteRankedRow(wDoc, ++rank, HttpUtility.HtmlEncode(wordInfo.Dat.Stem), wordInfo.Key);
        }
        WriteCollapsibleTableEnd(wDoc);

        rank = 0;
        WriteCollapsibleTableStart(wDoc, "h2", "Znakovna zaporedja", "Seznam znakovnih zaporedij", "cng", "Zaporedje");
        foreach (KeyDat<double, Word> wordInfo in cng.mBowSpace.GetKeywords(text.mFeatureVectors["cng"]).Take(TOP_ITEMS_COUNT))
        {
            WriteRankedRow(wDoc, ++rank, HttpUtility.HtmlEncode(wordInfo.Dat.Stem), wordInfo.Key);
        }
        WriteCollapsibleTableEnd(wDoc);

        rank = 0;
        WriteCollapsibleTableStart(wDoc, "h2", "Oblikoslovne oznake", "Seznam oblikoslovnih oznak", "pos", "Zaporedje");
        foreach (KeyDat<double, Word> wordInfo in pos.mBowSpace.GetKeywords(text.mFeatureVectors["pos"]).Take(TOP_ITEMS_COUNT))
        {
            WriteRankedRow(wDoc, ++rank, HttpUtility.HtmlEncode(wordInfo.Dat.Stem), wordInfo.Key);
        }
        WriteCollapsibleTableEnd(wDoc);

        WriteFooter(wDoc);
    }
}

/// <summary>
/// Writes one author's section of index.html (links, text list, averaged features, top vector
/// items), writes the page of each of the author's texts, and appends the author's row to the
/// features TSV.
/// </summary>
/// <param name="wIdx">Writer of index.html.</param>
/// <param name="authorNum">1-based author number; used for the compare link and element ids.</param>
/// <param name="authorName">Author name as used for the dictionary key.</param>
/// <param name="author">The author to report on.</param>
/// <param name="featuresTsv">Receives the author name followed by the averaged feature values.</param>
private static void WriteAuthorIndexSection(StreamWriter wIdx, int authorNum, string authorName, Author author, StringBuilder featuresTsv,
    FunctionWordsModel fuw, FrequentWordsModel frw, FrequentLemmasModel frl, CharNGramsModel cng, PosTagsModel pos)
{
    wIdx.WriteLine("<h2>Avtor: {0}</h2>", HttpUtility.HtmlEncode(authorName));
    featuresTsv.Append(authorName);
    if (author.mIsTagged)
    {
        wIdx.WriteLine("<div class='alert alert-info'><strong>Neznani avtor.</strong> <a href='{0}'>Primerjaj z ostalimi avtorji »</a></div>", GetCompareFileName(authorNum));
    }
    else
    {
        wIdx.WriteLine("<p><a href='{0}'>Primerjaj z ostalimi avtorji »</a></p>", GetCompareFileName(authorNum));
    }

    wIdx.WriteLine("<h3>Besedila</h3>");
    wIdx.WriteLine("<ul>");
    foreach (Text text in author.mTexts)
    {
        wIdx.WriteLine("<li><a href='{1}'>{0} »</a></li>", HttpUtility.HtmlEncode(text.mName), text.mHtmlFileName);
        WriteTextPage(text, fuw, frw, frl, cng, pos);
    }
    wIdx.WriteLine("</ul>");

    wIdx.WriteLine("<h3>Značilke</h3>");
    wIdx.WriteLine("<h4>Obseg besedišča</h4>");
    WriteTableStart(wIdx, PLAIN_TABLE_CLASS, "<tr><th>Značilka</th><th>Vrednost</th><th>Std. odklon</th></tr>");
    foreach (string featureNm in VOCABULARY_FEATURE_KEYS.Split(','))
    {
        WriteFeature(wIdx, featureNm, author.GetAvg(featureNm), author.GetStdDev(featureNm));
        featuresTsv.Append("\t" + author.GetAvg(featureNm));
    }
    WriteTableEnd(wIdx);

    wIdx.WriteLine("<h4>Berljivost</h4>");
    WriteTableStart(wIdx, PLAIN_TABLE_CLASS, "<tr><th>Značilka</th><th>Vrednost</th><th>Std. odklon</th></tr>");
    foreach (string featureNm in READABILITY_FEATURE_KEYS.Split(','))
    {
        WriteFeature(wIdx, featureNm, author.GetAvg(featureNm), author.GetStdDev(featureNm));
        featuresTsv.Append("\t" + author.GetAvg(featureNm));
    }
    featuresTsv.AppendLine();
    WriteTableEnd(wIdx);

    if (!author.mIsTagged)
    {
        wIdx.WriteLine("<h4>Razlikovalna moč značilk</h4>");
        WriteTableStart(wIdx, SORTABLE_TABLE_CLASS, "<tr><th>Značilka</th><th>Utež</th></tr>");
        // keys are identifiers, so compare ordinally rather than with the current culture
        foreach (string pKey in author.mFeatures.Keys.Where(x => x.StartsWith("p_", StringComparison.Ordinal)))
        {
            WriteFeature(wIdx, pKey.Substring(2), author.GetAvg(pKey), author.GetStdDev(pKey), /*sameCell=*/ true);
        }
        WriteTableEnd(wIdx);
    }

    // element ids carry the author number because all authors share one page
    int rank = 0;
    WriteCollapsibleTableStart(wIdx, "h4", "Funkcijske besede", "Seznam funkcijskih besed", "fuw_" + authorNum, "Beseda");
    foreach (Pair<string, double> word in author.GetTopVectorItems("fuw", TOP_ITEMS_COUNT, fuw.mBowSpace))
    {
        WriteRankedRow(wIdx, ++rank, HttpUtility.HtmlEncode(word.First), word.Second);
    }
    WriteCollapsibleTableEnd(wIdx);

    rank = 0;
    WriteCollapsibleTableStart(wIdx, "h4", "Pogoste besede", "Seznam pogostih besed", "frw_" + authorNum, "Beseda");
    foreach (Pair<string, double> word in author.GetTopVectorItems("frw", TOP_ITEMS_COUNT, frw.mBowSpace))
    {
        WriteRankedRow(wIdx, ++rank, HttpUtility.HtmlEncode(word.First), word.Second);
    }
    WriteCollapsibleTableEnd(wIdx);

    // column header is "Lema" here as well, matching the per-text page (it used to say "Beseda")
    rank = 0;
    WriteCollapsibleTableStart(wIdx, "h4", "Pogoste leme", "Seznam pogostih lem", "frl_" + authorNum, "Lema");
    foreach (Pair<string, double> word in author.GetTopVectorItems("frl", TOP_ITEMS_COUNT, frl.mBowSpace))
    {
        WriteRankedRow(wIdx, ++rank, HttpUtility.HtmlEncode(word.First), word.Second);
    }
    WriteCollapsibleTableEnd(wIdx);

    // this section used to emit a second, unbalanced </table>
    rank = 0;
    WriteCollapsibleTableStart(wIdx, "h4", "Znakovna zaporedja", "Seznam znakovnih zaporedij", "cng_" + authorNum, "Zaporedje");
    foreach (Pair<string, double> word in author.GetTopVectorItems("cng", TOP_ITEMS_COUNT, cng.mBowSpace))
    {
        WriteRankedRow(wIdx, ++rank, HttpUtility.HtmlEncode(word.First), word.Second);
    }
    WriteCollapsibleTableEnd(wIdx);

    rank = 0;
    WriteCollapsibleTableStart(wIdx, "h4", "Oblikoslovne oznake", "Seznam oblikoslovnih oznak", "pos_" + authorNum, "Zaporedje");
    foreach (Pair<string, double> word in author.GetTopVectorItems("pos", TOP_ITEMS_COUNT, pos.mBowSpace))
    {
        WriteRankedRow(wIdx, ++rank, HttpUtility.HtmlEncode(word.First), word.Second);
    }
    WriteCollapsibleTableEnd(wIdx);
}

/// <summary>
/// Writes the compare page of one author (three comparison tables against all authors) and
/// appends the same comparison, as one combined table, to the compare TSV.
/// </summary>
/// <param name="authorNum">1-based author number; determines the output file name.</param>
/// <param name="author">The author being compared.</param>
/// <param name="authors">All loaded authors.</param>
/// <param name="compareTsv">Receives a blank line, the author name, a header row and the table.</param>
private static void WriteComparePage(int authorNum, Author author, Dictionary<string, Author> authors, StringBuilder compareTsv)
{
    string authorCompareFileName = Path.Combine(OUTPUT_PATH, GetCompareFileName(authorNum));
    using (StreamWriter wAuthorCmp = new StreamWriter(authorCompareFileName, /*append=*/ false, Encoding.UTF8))
    {
        WriteHeader(wAuthorCmp);
        wAuthorCmp.WriteLine("<div class='back'><a href='index.html'>« Seznam avtorjev</a></div>");
        wAuthorCmp.WriteLine("<h1>Primerjava</h1>");
        wAuthorCmp.WriteLine("<h2>Avtor: {0}</h2>", HttpUtility.HtmlEncode(author.mName));

        wAuthorCmp.WriteLine("<h3>Obseg besedišča</h3>");
        WriteTableStart(wAuthorCmp, SORTABLE_TABLE_CLASS, "<tr><th>Avtor</th><th>DRB</th><th>BI</th><th>HS</th><th>HL</th><th>DRL</th><th>BI-L</th><th>HS-L</th><th>HL-L</th></tr>");
        WriteAuthorCompareTable(wAuthorCmp, authors.Values, author, VOCABULARY_FEATURE_KEYS.Split(','), /*isVec=*/ false);
        WriteTableEnd(wAuthorCmp);

        wAuthorCmp.WriteLine("<h3>Berljivost</h3>");
        WriteTableStart(wAuthorCmp, SORTABLE_TABLE_CLASS, "<tr><th>Avtor</th><th>B/P</th><th>Zn./B</th><th>Zl./B</th><th>DKB</th><th>ARI</th><th>Flesch</th><th>Fog</th></tr>");
        WriteAuthorCompareTable(wAuthorCmp, authors.Values, author, READABILITY_FEATURE_KEYS.Split(','), /*isVec=*/ false);
        WriteTableEnd(wAuthorCmp);

        wAuthorCmp.WriteLine("<h3>Vektorji značilk</h3>");
        WriteTableStart(wAuthorCmp, SORTABLE_TABLE_CLASS, "<tr><th>Avtor</th><th>FB</th><th>PB</th><th>PL</th><th>ZZ</th><th>Ozn.</th></tr>");
        WriteAuthorCompareTable(wAuthorCmp, authors.Values, author, VECTOR_FEATURE_KEYS.Split(','), /*isVec=*/ true);
        WriteTableEnd(wAuthorCmp);

        WriteFooter(wAuthorCmp);
    }

    // write to TSV
    compareTsv.AppendLine();
    compareTsv.AppendLine(author.mName);
    compareTsv.AppendLine("Avtor,DRB,BI,HS,HL,DRL,BI-L,HS-L,HL-L,B/P,Zn./B,Zl./B,DKB,ARI,Flesch,Fog,FB,PB,PL,ZZ,Ozn.".Replace(",", "\t"));
    WriteAuthorCompareTable(compareTsv, authors.Values, author, (VOCABULARY_FEATURE_KEYS + "," + READABILITY_FEATURE_KEYS + "," + VECTOR_FEATURE_KEYS).Split(','));
}
/// <summary>
/// Entry point of the blog-corpus processor. Loads the blog meta-data, then processes the XML
/// files in Config.DataFolder in batches: each file's text is tokenized and POS-tagged, the batch
/// is written to a temporary file and run through Parser.Parse, and one TEI-wrapped output file
/// (input XML + parsed text + blog meta-data) is written per input file. Inputs that already
/// have an output file or a ".locked" marker are skipped; if the parser produces no output, all
/// files of the batch are locked via LockFile.
/// </summary>
/// <param name="args">Optional; args[0] replaces the default "*.xml" file search pattern.</param>
static void Main(string[] args)
{
    Console.WriteLine("Nalagam meta-podatke o blogih...");
    LoadBlogMetaData();
    Console.WriteLine("Nalagam oznacevalnik...");
    PartOfSpeechTagger posTagger = new PartOfSpeechTagger(Config.PosTaggerModel, Config.LemmatizerModel);

    string pattern = "*.xml";
    if (args.Length > 0)
    {
        pattern = args[0];
    }

    // A batch must hold at least one file: with a non-positive batch size the inner loop would
    // never dequeue anything and the outer loop would never terminate.
    int batchSize = Math.Max(1, Config.BatchSize);

    XmlWriterSettings xmlSettings = new XmlWriterSettings();
    xmlSettings.Encoding = Encoding.UTF8;
    xmlSettings.Indent = true;

    Queue<string> fileNames = new Queue<string>(Directory.GetFiles(Config.DataFolder, pattern));
    while (fileNames.Count > 0)
    {
        // collect and tag up to batchSize documents into one combined document
        int n = 0;
        XmlDocument fullDoc = null;
        while (fileNames.Count > 0 && n < batchSize)
        {
            string fileName = fileNames.Dequeue();
            string outputFileName = MakeOutputFileName(fileName);
            if (File.Exists(outputFileName))
            {
                Console.WriteLine("Ze obdelano: {0}.", fileName);
                continue;
            }
            if (File.Exists(outputFileName + ".locked"))
            {
                Console.WriteLine("Zaklenjeno: {0}.", fileName);
                continue;
            }

            // load text
            Console.WriteLine("Datoteka: {0}...", fileName);
            XmlDocument inputDoc = LoadSanitizedInput(fileName);
            // SelectSingleNode returns null when there is no <besedilo> element; such a file is
            // handled like one with empty text instead of crashing the whole run
            XmlNode textNode = inputDoc.SelectSingleNode("//besedilo");
            string text = textNode == null ? "" : textNode.InnerText;
            if (text.Trim() == "") // *** empty documents are ignored
            {
                Console.WriteLine("*** Datoteka ne vsebuje besedila.");
                continue;
            }
            Corpus corpus = new Corpus();
            corpus.LoadFromTextSsjTokenizer(text);

            // tag text
            Console.WriteLine("Oznacujem besedilo...");
            posTagger.Tag(corpus);
            XmlDocument taggedDoc = new XmlDocument();
            // *** the TEI default namespace is stripped so that plain XPath ("//text") works
            taggedDoc.LoadXml(corpus.ToString("XML-MI").Replace("xmlns=\"http://www.tei-c.org/ns/1.0\"", ""));
            // remember which input file this <text> element came from
            ((XmlElement)taggedDoc.SelectSingleNode("//text")).SetAttribute("fileName", fileName);

            // append text to fullDoc
            if (fullDoc == null)
            {
                fullDoc = taggedDoc;
            }
            else
            {
                XmlDocumentFragment xmlFrag = fullDoc.CreateDocumentFragment();
                xmlFrag.InnerXml = taggedDoc.SelectSingleNode("//text").OuterXml;
                fullDoc.DocumentElement.AppendChild(xmlFrag);
            }
            n++;
        }

        // nothing to do?
        if (fullDoc == null)
        {
            continue;
        }

        // save tagged text for parsing
        Console.WriteLine("Pripravljam datoteke za razclenjevanje...");
        string tmpBaseName = Guid.NewGuid().ToString("N");
        string tmpFileNameIn = Path.GetFullPath(Path.Combine(Config.TmpFolder, tmpBaseName + ".tmp"));
        string tmpFileNameOut = Path.GetFullPath(Path.Combine(Config.TmpFolder, tmpBaseName + ".out.tmp"));
        using (XmlWriter w = XmlWriter.Create(tmpFileNameIn, xmlSettings))
        {
            fullDoc.Save(w);
        }

        // parse text
        Console.WriteLine("Zaganjam razclenjevalnik...");
        Parser.Parse(tmpFileNameIn, tmpFileNameOut);

        // load results
        if (!File.Exists(tmpFileNameOut))
        {
            // lock files and continue
            Console.WriteLine("*** Prislo je do napake pri razclenjevanju. Nadaljujem z obdelavo.");
            foreach (XmlElement textElement in fullDoc.SelectNodes("//text"))
            {
                LockFile(textElement.Attributes["fileName"].Value);
            }
            continue;
        }
        fullDoc = new XmlDocument();
        fullDoc.Load(tmpFileNameOut);

        // create output files
        Console.WriteLine("Pisem izhodne datoteke...");
        foreach (XmlNode txtNode in fullDoc.SelectNodes("//text"))
        {
            string fileName = txtNode.Attributes["fileName"].Value;
            ((XmlElement)txtNode).RemoveAttribute("fileName");
            Console.WriteLine("Datoteka: {0}...", fileName);
            XmlDocument inputDoc = LoadSanitizedInput(fileName);

            // insert input XML into TEI-XML
            XmlDocument doc = new XmlDocument();
            doc.LoadXml("<TEI>" + txtNode.OuterXml + "</TEI>");
            XmlDocumentFragment docPart = doc.CreateDocumentFragment();
            docPart.InnerXml = inputDoc.OuterXml;
            doc.DocumentElement.PrependChild(docPart);

            // insert blog meta-data; a missing <blog> element is reported like an unknown blog
            XmlNode blogNode = doc.SelectSingleNode("//header/blog");
            string key = blogNode == null ? "" : blogNode.InnerText;
            BlogMetaData metaData;
            if (blogNode == null || !mBlogMetaData.TryGetValue(key, out metaData))
            {
                Console.WriteLine("*** Ne najdem podatkov o blogu \"{0}\".", key);
                continue;
            }
            Console.WriteLine("Vstavljam meta-podatke o blogu...");
            XmlNode header = doc.SelectSingleNode("//header");
            AppendHeaderElement(doc, header, "blogSpletniNaslov", metaData.mBlogUrl);
            AppendHeaderElement(doc, header, "blogNaslov", metaData.mBlogTitle);
            AppendHeaderElement(doc, header, "blogNaslovKratek", metaData.mBlogTitleShort);
            // NOTE: metaData.mAuthorEMail is intentionally not written to the output
            AppendHeaderElement(doc, header, "avtorSpol", metaData.mAuthorGender);
            AppendHeaderElement(doc, header, "avtorStarost", metaData.mAuthorAge);
            AppendHeaderElement(doc, header, "avtorRegija", metaData.mAuthorLocation);
            AppendHeaderElement(doc, header, "avtorIzobrazba", metaData.mAuthorEducation);

            // write results
            Console.WriteLine("Zapisujem rezultate...");
            using (XmlWriter w = XmlWriter.Create(MakeOutputFileName(fileName), xmlSettings))
            {
                doc.Save(w);
            }
        }
    }

    // purge temp folder
    foreach (string tmpFile in Directory.GetFiles(Config.TmpFolder, "*.tmp"))
    {
        File.Delete(tmpFile);
    }

    // all done
    Console.WriteLine("Koncano.");
}

/// <summary>
/// Reads an input XML file, removes the stray "// ]]>" and "//--><!]]>" sequences from its
/// content and parses the result.
/// </summary>
/// <param name="fileName">Path of the input XML file.</param>
/// <returns>The parsed document.</returns>
private static XmlDocument LoadSanitizedInput(string fileName)
{
    string xml = File.ReadAllText(fileName);
    xml = xml.Replace("// ]]>", "").Replace("//--><!]]>", "");
    XmlDocument doc = new XmlDocument();
    doc.LoadXml(xml);
    return doc;
}

/// <summary>Appends a new element with the given name and text content to the header node.</summary>
private static void AppendHeaderElement(XmlDocument doc, XmlNode header, string name, string value)
{
    header.AppendChild(doc.CreateElement(name)).InnerText = value;
}