/// <summary>
/// Reads the UG listing held in <c>endText</c> line by line, collects the fields of each
/// record into the <c>UG_*</c> state variables and saves every completed record through
/// <c>ArticleDBDataModelContainer</c>.
/// </summary>
/// <remarks>
/// Every line is split with <c>line_separator</c> into at most two parts: a label and a value.
/// Two-part lines update the pending state (record count, authors, title, source, English
/// keywords, DOI). A non-empty one-part line that contains one of the record markers
/// ("1.", "2.", ...) is treated as a record boundary: the pending state is saved when both the
/// author line and the title are present, otherwise a message is appended to the crawler log.
/// NOTE(review): state collected after the last boundary line is never saved by this loop —
/// confirm whether the input always ends with a marker line.
/// </remarks>
/// <exception cref="FormatException">The record-count value is not a valid integer.</exception>
public static void get_UG_Document_content()
{
    UG_articles_Count = 0;

    // Until the record-count line has been read the only marker is the empty string,
    // which every string contains, so any non-empty one-part line counts as a boundary.
    string[] UG_articles_Matrix = { String.Empty };

    using (StringReader sr = new StringReader(endText))
    {
        string UG_line;
        while ((UG_line = sr.ReadLine()) != null)
        {
            string[] UG_separatedContent = UG_line.Split(line_separator, 2);

            if (UG_separatedContent.Length == 1)
            {
                string UG_single_part = UG_separatedContent[0];
                if (UG_single_part == "")
                {
                    continue;
                }

                if (UG_articles_Matrix.Any(x => UG_single_part.Contains(x)))
                {
                    if (UG_author_line != null && UG_Tytul != null)
                    {
                        UG_SavePendingArticle();
                    }
                    else
                    {
                        UG_AppendToLog("Empty line detected." + '\n');
                    }
                }

                continue;
            }

            if (UG_separatedContent.Length != 2)
            {
                continue;
            }

            string UG_label = UG_separatedContent[0];
            string UG_value = UG_separatedContent[1];

            if (UG_label.Contains("Liczba odnalezionych"))
            {
                // Build the markers "1.", "2.", ... used to recognise record boundaries.
                UG_articles_Count = Convert.ToInt32(UG_value);
                UG_articles_Matrix = new string[UG_articles_Count];
                for (int z = 0; z < UG_articles_Count; z++)
                {
                    UG_articles_Matrix[z] = (z + 1) + ".";
                }
            }
            else if (UG_LabelContains(UG_label, "autorzy"))
            {
                UG_author_line = UG_value;
                UG_autors = UG_value.Split(autor_separators, StringSplitOptions.RemoveEmptyEntries);
            }
            else if (UG_LabelContains(UG_label, "tytu"))
            {
                // "tytu" also covers "tytul", "TYTUL" and "TYTUL[ROZDZIALU, FRAGMENTU]".
                UG_Tytul = UG_value;
            }
            else if (UG_LabelContains(UG_label, "zrodlo"))
            {
                UG_Zrodlo = UG_value;
            }
            else if (UG_label.Contains("Slowa kluczowe w j. ang."))
            {
                UG_Slowa_kluczowe_j_ang = UG_value.Split(separators);
                UG_slowa_kluczowe_j_ang_line = UG_value;
            }
            else if (UG_LabelContains(UG_label, "doi"))
            {
                UG_DOI = UG_value;
            }
        }
    }
}

/// <summary>
/// Stores the pending UG record (article, authors and vocabulary terms) and clears the
/// pending state so the next record starts empty.
/// </summary>
private static void UG_SavePendingArticle()
{
    using (var dbContext = new ArticleDBDataModelContainer())
    {
        // Text from which the vocabulary terms of this article are taken.
        // NOTE(review): the prepared keyword and title texts are appended back to back;
        // confirm that TermsPrepataions output carries its own delimiter.
        var document = new StringBuilder();
        var ug_article = dbContext.UG_ArticlesSet.Create();

        ug_article.article_author_line = UG_author_line;
        UG_author_line = null;

        ug_article.article_keywords = UG_slowa_kluczowe_j_ang_line;
        if (!string.IsNullOrWhiteSpace(UG_slowa_kluczowe_j_ang_line))
        {
            document.Append(TextPreparing.TermsPrepataions(UG_slowa_kluczowe_j_ang_line));
        }
        UG_slowa_kluczowe_j_ang_line = null;

        ug_article.article_source = UG_Zrodlo;
        UG_Zrodlo = null;

        ug_article.article_title = UG_Tytul;
        if (!string.IsNullOrWhiteSpace(UG_Tytul))
        {
            document.Append(TextPreparing.TermsPrepataions(UG_Tytul));
        }
        UG_Tytul = null;

        ug_article.article_DOI = UG_DOI;
        UG_DOI = null;

        // The author tokens are consumed in pairs: name first, then surname.
        for (int k = 0; k + 1 < UG_autors.Length; k += 2)
        {
            var authors_of_the_article = dbContext.AuthorSet.Create();
            authors_of_the_article.author_name = UG_autors[k];
            authors_of_the_article.author_surename = UG_autors[k + 1];
            ug_article.Author.Add(authors_of_the_article);
        }

        dbContext.UG_ArticlesSet.Add(ug_article);

        // NOTE(review): a filter against F:\Magistry files\csv_files\Allowed_term_dictionary.csv
        // (dropping tokens of three characters or fewer) appears to have been intended here;
        // the exact rule needs confirming before it is implemented.
        // TODO: record the id of the document in which a given term occurs.
        string[] UG_tokens = document.ToString().Split(
            new[] { ' ', ';', ':', ',' },
            StringSplitOptions.RemoveEmptyEntries);

        foreach (string UG_token in UG_tokens)
        {
            // Tokens made only of other whitespace (tabs, line breaks) are not terms.
            if (string.IsNullOrWhiteSpace(UG_token))
            {
                continue;
            }

            var terms = dbContext.Terms_Vocabulary.Create();
            terms.term_value = UG_token;

            try
            {
                ug_article.Terms_Vocabulary.Add(terms);
            }
            catch (Exception addingTermToDB)
            {
                UG_AppendToLog(DateTime.Now.ToString() + addingTermToDB.ToString() + Environment.NewLine);
            }
        }

        try
        {
            dbContext.SaveChanges();
        }
        catch (Exception ex)
        {
            UG_AppendToLog(DateTime.Now.ToString() + ex.ToString() + Environment.NewLine);
        }
    }
}

/// <summary>
/// Returns true when <paramref name="label"/> contains <paramref name="keyword"/>,
/// compared ordinally and ignoring case.
/// </summary>
private static bool UG_LabelContains(string label, string keyword)
{
    return label.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) >= 0;
}

/// <summary>
/// Appends <paramref name="message"/> to the UG crawler log, so earlier entries are kept.
/// </summary>
private static void UG_AppendToLog(string message)
{
    File.AppendAllText(@"F:\\Magistry files\UG_crawler_Log.txt", message);
}
/// <summary>
/// Reads the PP listing held in <c>endText</c> line by line, collects the fields of each
/// record into the <c>PP_*</c> state variables and saves every completed record through
/// <c>ArticleDBDataModelContainer</c>.
/// </summary>
/// <remarks>
/// Every line is split with <c>line_separator</c> into at most two parts: a label and a value.
/// Two-part lines update the pending state (record count, authors, title, source, year,
/// publication language, DOI). A non-empty one-part line that contains one of the record
/// markers ("1.", "2.", ...) is treated as a record boundary: the pending state is saved when
/// both the author line and the title are present, otherwise a message is appended to the
/// crawler log.
/// NOTE(review): state collected after the last boundary line is never saved by this loop —
/// confirm whether the input always ends with a marker line.
/// </remarks>
/// <exception cref="FormatException">The record-count value is not a valid integer.</exception>
public static void get_PP_Document_content()
{
    PP_articles_Count = 0;

    // Until the record-count line has been read the only marker is the empty string,
    // which every string contains, so any non-empty one-part line counts as a boundary.
    string[] PP_articles_Matrix = { String.Empty };

    using (StringReader sr = new StringReader(endText))
    {
        string PP_line;
        while ((PP_line = sr.ReadLine()) != null)
        {
            string[] PP_separatedContent = PP_line.Split(line_separator, 2);

            if (PP_separatedContent.Length == 1)
            {
                string PP_single_part = PP_separatedContent[0];
                if (PP_single_part == "")
                {
                    continue;
                }

                if (PP_articles_Matrix.Any(x => PP_single_part.Contains(x)))
                {
                    if (PP_author_line != null && PP_Tytul != null)
                    {
                        PP_SavePendingArticle();
                    }
                    else
                    {
                        PP_AppendToLog("Empty line detected." + '\n');
                    }
                }

                continue;
            }

            if (PP_separatedContent.Length != 2)
            {
                continue;
            }

            string PP_label = PP_separatedContent[0];
            string PP_value = PP_separatedContent[1];

            if (PP_label.Contains("Liczba odnalezionych"))
            {
                // Build the markers "1.", "2.", ... used to recognise record boundaries.
                PP_articles_Count = Convert.ToInt32(PP_value);
                PP_articles_Matrix = new string[PP_articles_Count];
                for (int l = 0; l < PP_articles_Count; l++)
                {
                    PP_articles_Matrix[l] = (l + 1) + ".";
                }
            }
            else if (PP_LabelContains(PP_label, "autor"))
            {
                PP_author_line = PP_value;
                PP_autors = PP_value.Split(autor_separators, StringSplitOptions.RemoveEmptyEntries);
            }
            else if (PP_LabelContains(PP_label, "tytu"))
            {
                PP_Tytul = PP_value;
            }
            else if (PP_LabelContains(PP_label, "zrodlo"))
            {
                PP_Zrodlo = PP_value;
            }
            else if (PP_LabelContains(PP_label, "rok"))
            {
                // The year is taken from the first five characters of the value;
                // an empty or non-numeric value yields 0.
                string PP_year_text = PP_value.Length > 5 ? PP_value.Substring(0, 5) : PP_value;
                int PP_year;
                PP_Rok = int.TryParse(PP_year_text.Trim(), out PP_year) ? PP_year : 0;
            }
            else if (PP_LabelContains(PP_label, "jezyk publikacji") || PP_LabelContains(PP_label, "język publikacji"))
            {
                PP_Jezyk_Publikacji = PP_value;
            }
            else if (PP_LabelContains(PP_label, "doi"))
            {
                PP_DOI = PP_value;
            }
        }
    }
}

/// <summary>
/// Stores the pending PP record (article, authors and vocabulary terms) and clears the
/// pending state so the next record starts empty. Any exception raised while storing is
/// appended to the PP crawler log instead of being propagated.
/// </summary>
private static void PP_SavePendingArticle()
{
    try
    {
        using (var PPdbContext = new ArticleDBDataModelContainer())
        {
            // Text from which the vocabulary terms of this article are taken.
            // NOTE(review): the prepared title and source texts are appended back to back;
            // confirm that TermsPrepataions output carries its own delimiter.
            var document = new StringBuilder();
            var pp_article = PPdbContext.PP_ArticlesSet.Create();

            pp_article.article_author_line = PP_author_line;
            PP_author_line = null;

            pp_article.article_title = PP_Tytul;
            if (!string.IsNullOrWhiteSpace(PP_Tytul))
            {
                document.Append(TextPreparing.TermsPrepataions(PP_Tytul));
            }
            PP_Tytul = null;

            pp_article.article_source = PP_Zrodlo;
            if (!string.IsNullOrWhiteSpace(PP_Zrodlo))
            {
                document.Append(TextPreparing.TermsPrepataions(PP_Zrodlo));
            }
            PP_Zrodlo = null;

            pp_article.article_year = PP_Rok;
            PP_Rok = 0;
            pp_article.article_language = PP_Jezyk_Publikacji;
            PP_Jezyk_Publikacji = null;
            pp_article.article_DOI = PP_DOI;
            PP_DOI = null;

            // The author tokens are consumed in groups of four: the first token is stored as
            // the surname and the second as the name; groups starting with "IC)" are skipped.
            for (int z = 0; z + 3 < PP_autors.Length; z += 4)
            {
                if (PP_autors[z] != "IC)")
                {
                    var authors_of_the_PP_article = PPdbContext.AuthorSet.Create();
                    authors_of_the_PP_article.author_name = PP_autors[z + 1];
                    authors_of_the_PP_article.author_surename = PP_autors[z];
                    pp_article.Author.Add(authors_of_the_PP_article);
                }
            }

            PPdbContext.PP_ArticlesSet.Add(pp_article);

            // NOTE(review): a filter against F:\Magistry files\csv_files\Allowed_term_dictionary.csv
            // (dropping tokens of three characters or fewer) appears to have been intended here;
            // the exact rule needs confirming before it is implemented.
            // TODO: record the id of the document in which a given term occurs.
            string[] PP_tokens = document.ToString().Split(
                new[] { ' ', ';', ':', ',' },
                StringSplitOptions.RemoveEmptyEntries);

            foreach (string PP_token in PP_tokens)
            {
                // Tokens made only of other whitespace (tabs, line breaks) are not terms.
                if (string.IsNullOrWhiteSpace(PP_token))
                {
                    continue;
                }

                var terms = PPdbContext.Terms_Vocabulary.Create();
                terms.term_value = PP_token;
                pp_article.Terms_Vocabulary.Add(terms);
            }

            PPdbContext.SaveChanges();
        }
    }
    catch (Exception ex)
    {
        PP_AppendToLog(ex.ToString() + Environment.NewLine);
    }
}

/// <summary>
/// Returns true when <paramref name="label"/> contains <paramref name="keyword"/>,
/// compared ordinally and ignoring case.
/// </summary>
private static bool PP_LabelContains(string label, string keyword)
{
    return label.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) >= 0;
}

/// <summary>
/// Appends <paramref name="message"/> to the PP crawler log, so earlier entries are kept.
/// </summary>
private static void PP_AppendToLog(string message)
{
    File.AppendAllText(@"F:\\Magistry files\PP_crawler_Log.txt", message);
}
//potrzebnie zaimplementowac divide and conquer dla duzych plikow public static void get_WSB_Document_content() { string[] WSB_newcontent = new string[hapDoc.DocumentNode.InnerText.Length]; string[] WSB_separatedContent = new string[hapDoc.DocumentNode.InnerText.Length]; WSB_articles_Count = 0; string[] WSB_articles_Matrix = { String.Empty }; using (StringReader sr = new StringReader(endText)) { int p = 0; string WSB_line; // 22.08.2018 New version of reader while ((WSB_line = sr.ReadLine()) != null) { WSB_newcontent[p] = WSB_line; WSB_separatedContent = WSB_line.Split(line_separator, 2); if (WSB_separatedContent.Length == 1 & WSB_separatedContent[0] == "") { continue; } else if (WSB_separatedContent.Length == 1 & WSB_articles_Matrix.Any(x => WSB_separatedContent[0].Contains(x))) { if (WSB_author_line != null & WSB_Tytul_pracy != null) { using (var dbContext = new ArticleDBDataModelContainer()) { var document = new StringBuilder(); var wsb_article = dbContext.WSB_ArticlesSet.Create(); if (WSB_author_line == null) { WSB_author_line = "Not_defined"; } wsb_article.article_authors = WSB_author_line; WSB_author_line = null; if (WSB_Tytul_pracy == null) { WSB_Tytul_pracy = "Not_defined"; } wsb_article.article_title = WSB_Tytul_pracy; if (WSB_Tytul_pracy != String.Empty | WSB_Tytul_pracy != " " | WSB_Tytul_pracy != null) { var termTitle_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_pracy); document.Append(termTitle_WSB); } WSB_Tytul_pracy = null; if (WSB_Adres_wydawniczy == null) { WSB_Adres_wydawniczy = "Not_defined"; } wsb_article.article_publisher_adres = WSB_Adres_wydawniczy; WSB_Adres_wydawniczy = null; if (WSB_Tytul_calosci == null) { WSB_Tytul_calosci = "Not_defined"; } wsb_article.article_common_title = WSB_Tytul_calosci; if (WSB_Tytul_calosci != String.Empty | WSB_Tytul_calosci != " " | WSB_Tytul_calosci != null) { var termFullTitle_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_calosci); document.Append(termFullTitle_WSB); } WSB_Tytul_calosci = null; if 
(WSB_Slowa_kluczowe_j_pl_line == null) { WSB_Slowa_kluczowe_j_pl_line = "Not_defined"; } wsb_article.article_pl_keywords = WSB_Slowa_kluczowe_j_pl_line; if (WSB_Slowa_kluczowe_j_pl_line != String.Empty | WSB_Slowa_kluczowe_j_pl_line != " " | WSB_Slowa_kluczowe_j_pl_line != null) { var term_PL_Keywords_WSB = TextPreparing.TermsPrepataions(WSB_Slowa_kluczowe_j_pl_line); document.Append(term_PL_Keywords_WSB); } WSB_Slowa_kluczowe_j_pl_line = null; if (WSB_Slowa_kluczowe_j_ang_line == null) { WSB_Slowa_kluczowe_j_ang_line = "Not_defined"; } wsb_article.article_eng_keywords = WSB_Slowa_kluczowe_j_ang_line; if (WSB_Slowa_kluczowe_j_ang_line != String.Empty | WSB_Slowa_kluczowe_j_ang_line != " " | WSB_Slowa_kluczowe_j_ang_line != null) { var term_Eng_Keywords_WSB = TextPreparing.TermsPrepataions(WSB_Slowa_kluczowe_j_ang_line); document.Append(term_Eng_Keywords_WSB); } WSB_Slowa_kluczowe_j_ang_line = null; if (WSB_Tytul_pracy_w_innym_j == null) { WSB_Tytul_pracy_w_innym_j = "Not_defined"; } wsb_article.article_title_other_lang = WSB_Tytul_pracy_w_innym_j; if (WSB_Tytul_pracy_w_innym_j != String.Empty | WSB_Tytul_pracy_w_innym_j != " " | WSB_Tytul_pracy_w_innym_j != null) { var term_Title_Other_Lang_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_pracy_w_innym_j); document.Append(term_Title_Other_Lang_WSB); } WSB_Tytul_pracy_w_innym_j = null; if (WSB_Szczegoly == null) { WSB_Szczegoly = "Not_defined"; } wsb_article.article_details = WSB_Szczegoly; WSB_Szczegoly = null; if (WSB_URL == null) { WSB_URL = "Not_defined"; } wsb_article.article_URL = WSB_URL; WSB_URL = null; if (WSB_DOI == null) { WSB_DOI = "Not_defined"; } wsb_article.article_DOI = WSB_DOI; WSB_DOI = null; for (int k = 0; k <= WSB_autors.Length - 2;) { var authors_of_the_article = dbContext.AuthorSet.Create(); authors_of_the_article.author_name = WSB_autors[k]; authors_of_the_article.author_surename = WSB_autors[k + 1]; wsb_article.Author.Add(authors_of_the_article); k += 2; } 
dbContext.WSB_ArticlesSet.Add(wsb_article); var _document = document.ToString().Split(' ', ';', ':', ','); for (int k = 0; k <= _document.Length - 1; k++) { var terms = dbContext.Terms_Vocabulary.Create(); string dictionary_text = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv"); string[] allowed_dictionary = dictionary_text.Split(',', '\n'); for (int d = 0; d <= _document.Length - 1; d++) { for (int j = 0; j <= allowed_dictionary.Length - 1; j++) { if (_document[d].Length > 3 & _document[d].Contains(allowed_dictionary[j])) { continue; } else if (_document[d].Length <= 3 & !(_document[d].Contains(allowed_dictionary[j]))) { _document.ToList().RemoveAt(d); } } } //tutaj potrzebnie przepisac id dokumenta w ktorym wystepuje dane slowo if (_document[k] != String.Empty | _document[k] != " " | _document[k] != null | _document[k] != Char.IsDigit(' ').ToString()) { //dbContext.Terms_Vocabulary.Where(u) var termVocabularyTable = dbContext.Terms_Vocabulary; terms.term_value = _document[k]; } wsb_article.Terms_Vocabulary.Add(terms); } try { dbContext.SaveChanges(); } catch (Exception ex) { File.WriteAllText(@"F:\\Magistry files\WSB_crawler_Log.txt", ex.ToString()); } } } else { continue; } } else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("autor") | WSB_separatedContent[0].Contains("Autor") | WSB_separatedContent[0] == "Autorzy")) { WSB_autors = WSB_separatedContent[1].Split(autor_separators, StringSplitOptions.RemoveEmptyEntries); WSB_author_line = WSB_separatedContent[1]; } else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul pracy") | WSB_separatedContent[0].Contains("Tytul pracy") | WSB_separatedContent[0] == "Tytul pracy")) { WSB_Tytul_pracy = WSB_separatedContent[1]; } else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].Contains("Liczba odnalezionych") | WSB_separatedContent[0] == "Liczba odnalezionych rekordow")) { WSB_articles_Count = 
Convert.ToInt32(WSB_separatedContent[1]); WSB_articles_Matrix = new string[WSB_articles_Count]; for (int z = 0; z <= WSB_articles_Count - 1; z++) { WSB_articles_Matrix[z] = (z + 1) + "."; } } else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("adres wydawniczy") | WSB_separatedContent[0].Contains("Adres wydawniczy") | WSB_separatedContent[0] == "Adres wydawniczy")) { WSB_Adres_wydawniczy = WSB_separatedContent[1]; } else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("polskie hasla") | WSB_separatedContent[0].Contains("Polskie hasla") | WSB_separatedContent[0] == "Polskie hasla przedmiotowe")) { WSB_Slowa_kluczowe_j_pl = WSB_separatedContent[1].Split(separators, StringSplitOptions.RemoveEmptyEntries); WSB_Slowa_kluczowe_j_pl_line = WSB_separatedContent[1]; } else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("angielskie hasla") | WSB_separatedContent[0].Contains("Angielskie hasla") | WSB_separatedContent[0] == "Angielskie hasla przedmiotowe")) { WSB_Slowa_kluczowe_j_ang = WSB_separatedContent[1].Split(separators, StringSplitOptions.RemoveEmptyEntries); WSB_Slowa_kluczowe_j_ang_line = WSB_separatedContent[1]; } else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul calosci") | WSB_separatedContent[0].Contains("Tytul calosci") | WSB_separatedContent[0] == "Tytul calosci")) { WSB_Tytul_calosci = WSB_separatedContent[1]; } else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("doi") | WSB_separatedContent[0].Contains("DOI") | WSB_separatedContent[0] == "DOI")) { WSB_DOI = WSB_separatedContent[1]; } else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul pracy w innym") | WSB_separatedContent[0].Contains("Tytul pracy w innym") | WSB_separatedContent[0] == "Tytul pracy w innym jezyku")) { WSB_Tytul_pracy_w_innym_j = WSB_separatedContent[1]; } else if 
(WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("szczegoly") | WSB_separatedContent[0].Contains("Szczegoly") | WSB_separatedContent[0] == "Szczegoly")) { WSB_Szczegoly = WSB_separatedContent[1]; } else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("url") | WSB_separatedContent[0].Contains("Url") | WSB_separatedContent[0] == "Adres url")) { WSB_URL = WSB_separatedContent[1]; } p++; } #region Old_iteration_method /* -- 21.08.2018 Old wersion of iteration * for (int i = 0; i <= hapDoc.DocumentNode.InnerText.Length; i++) * { * WSB_line = sr.ReadLine(); * if (WSB_line != null) * { * WSB_newcontent[i] = WSB_line; * WSB_separatedContent = WSB_line.Split(line_separator, 2); * * * if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("autor") | WSB_separatedContent[0].Contains("Autor") | WSB_separatedContent[0] == "Autorzy")) * { * //System.Windows.MessageBox.Show(WSB_separatedContent[1]); * WSB_autors = WSB_separatedContent[1].Split(autor_separators, StringSplitOptions.RemoveEmptyEntries); * WSB_author_line = WSB_separatedContent[1]; * } * * else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].Contains("Liczba odnalezionych") | WSB_separatedContent[0] == "Liczba odnalezionych rekordow")) * { * WSB_articles_Count = Convert.ToInt32(WSB_separatedContent[1]); * WSB_articles_Matrix = new string[WSB_articles_Count]; * for (int z = 0; z <= WSB_articles_Count - 1; z++) * { * WSB_articles_Matrix[z] = (z + 1) + "."; * } * } * * else if (WSB_separatedContent.Length == 1 & WSB_articles_Matrix.Any(x => WSB_separatedContent[0].Contains(x))) * { * if (WSB_author_line != null & WSB_Tytul_pracy != null) * { * using(var dbContext = new ArticleDBDataModelContainer()) * { * var document = new StringBuilder(); * var wsb_article = dbContext.WSB_ArticlesSet.Create(); * * if (WSB_author_line == null) * { * WSB_author_line = "Not_defined"; * } * wsb_article.article_authors = 
WSB_author_line; * WSB_author_line = null; * * if (WSB_Tytul_pracy == null) * { * WSB_Tytul_pracy = "Not_defined"; * } * wsb_article.article_title = WSB_Tytul_pracy; * if (WSB_Tytul_pracy != String.Empty | WSB_Tytul_pracy != " " | WSB_Tytul_pracy != null) * { * var termTitle_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_pracy); * document.Append(termTitle_WSB); * } * WSB_Tytul_pracy = null; * * if (WSB_Adres_wydawniczy == null) * { * WSB_Adres_wydawniczy = "Not_defined"; * } * wsb_article.article_publisher_adres = WSB_Adres_wydawniczy; * WSB_Adres_wydawniczy = null; * * if (WSB_Tytul_calosci == null) * { * WSB_Tytul_calosci = "Not_defined"; * } * wsb_article.article_common_title = WSB_Tytul_calosci; * if (WSB_Tytul_calosci != String.Empty | WSB_Tytul_calosci != " " | WSB_Tytul_calosci != null) * { * var termFullTitle_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_calosci); * document.Append(termFullTitle_WSB); * } * WSB_Tytul_calosci = null; * * if (WSB_Slowa_kluczowe_j_pl_line == null) * { * WSB_Slowa_kluczowe_j_pl_line = "Not_defined"; * } * wsb_article.article_pl_keywords = WSB_Slowa_kluczowe_j_pl_line; * if (WSB_Slowa_kluczowe_j_pl_line != String.Empty | WSB_Slowa_kluczowe_j_pl_line != " " | WSB_Slowa_kluczowe_j_pl_line != null) * { * var term_PL_Keywords_WSB = TextPreparing.TermsPrepataions(WSB_Slowa_kluczowe_j_pl_line); * document.Append(term_PL_Keywords_WSB); * } * WSB_Slowa_kluczowe_j_pl_line = null; * * if (WSB_Slowa_kluczowe_j_ang_line == null) * { * WSB_Slowa_kluczowe_j_ang_line = "Not_defined"; * } * wsb_article.article_eng_keywords = WSB_Slowa_kluczowe_j_ang_line; * if (WSB_Slowa_kluczowe_j_ang_line != String.Empty | WSB_Slowa_kluczowe_j_ang_line != " " | WSB_Slowa_kluczowe_j_ang_line != null) * { * var term_Eng_Keywords_WSB = TextPreparing.TermsPrepataions(WSB_Slowa_kluczowe_j_ang_line); * document.Append(term_Eng_Keywords_WSB); * } * WSB_Slowa_kluczowe_j_ang_line = null; * * if (WSB_Tytul_pracy_w_innym_j == null) * { * WSB_Tytul_pracy_w_innym_j = 
"Not_defined"; * } * wsb_article.article_title_other_lang = WSB_Tytul_pracy_w_innym_j; * if (WSB_Tytul_pracy_w_innym_j != String.Empty | WSB_Tytul_pracy_w_innym_j != " " | WSB_Tytul_pracy_w_innym_j != null) * { * var term_Title_Other_Lang_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_pracy_w_innym_j); * document.Append(term_Title_Other_Lang_WSB); * } * WSB_Tytul_pracy_w_innym_j = null; * * if (WSB_Szczegoly == null) * { * WSB_Szczegoly = "Not_defined"; * } * wsb_article.article_details = WSB_Szczegoly; * WSB_Szczegoly = null; * * if (WSB_URL == null) * { * WSB_URL = "Not_defined"; * } * wsb_article.article_URL = WSB_URL; * WSB_URL = null; * * if (WSB_DOI == null) * { * WSB_DOI = "Not_defined"; * } * wsb_article.article_DOI = WSB_DOI; * WSB_DOI = null; * * * for (int k = 0; k <= WSB_autors.Length - 2;) * { * var authors_of_the_article = dbContext.AuthorSet.Create(); * authors_of_the_article.author_name = WSB_autors[k]; * authors_of_the_article.author_surename = WSB_autors[k + 1]; * wsb_article.Author.Add(authors_of_the_article); * k += 2; * } * * dbContext.WSB_ArticlesSet.Add(wsb_article); * * var _document = document.ToString().Split(' ', ';', ':', ','); * for (int k = 0; k <= _document.Length - 1; k++) * { * var terms = dbContext.Terms_Vocabulary.Create(); * // * string dictionary_text = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv"); * string[] allowed_dictionary = dictionary_text.Split(',', '\n'); * * for (int p = 0; p <= _document.Length - 1; p++) * { * for (int j = 0; j <= allowed_dictionary.Length - 1; j++) * { * if (_document[p].Length > 3 & _document[p].Contains(allowed_dictionary[j])) * { * continue; * } * else if (_document[p].Length <= 3 & !(_document[p].Contains(allowed_dictionary[j]))) * { * _document.ToList().RemoveAt(p); * } * * } * } * * //tutaj potrzebnie przepisac id dokumenta w ktorym wystepuje dane slowo * if (_document[k] != String.Empty | _document[k] != " " | _document[k] != null | _document[k] != Char.IsDigit(' 
').ToString()) * { * //dbContext.Terms_Vocabulary.Where(u) * var termVocabularyTable = dbContext.Terms_Vocabulary; * terms.term_value = _document[k]; * * } * wsb_article.Terms_Vocabulary.Add(terms); * } * try * { * dbContext.SaveChanges(); * } * catch (Exception ex) * { * File.WriteAllText(@"F:\\Magistry files\WSB_crawler_Log.txt", ex.ToString()); * } * * } * } * * else * { * //return; * //System.Windows.MessageBox.Show("brak danych!"); * //File.WriteAllText(@"F:\\Magistry files\WSB_emptyLines.txt", "empty_line"); * continue; * } * * } * * else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul pracy") | WSB_separatedContent[0].Contains("Tytul pracy") | WSB_separatedContent[0] == "Tytul pracy")) * { * //System.Windows.MessageBox.Show(WSB_separatedContent[1]); * WSB_Tytul_pracy = WSB_separatedContent[1]; * } * * else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("adres wydawniczy") | WSB_separatedContent[0].Contains("Adres wydawniczy") | WSB_separatedContent[0] == "Adres wydawniczy")) * { * //System.Windows.MessageBox.Show(WSB_separatedContent[1]); * WSB_Adres_wydawniczy = WSB_separatedContent[1]; * } * * * else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("polskie hasla") | WSB_separatedContent[0].Contains("Polskie hasla") | WSB_separatedContent[0] == "Polskie hasla przedmiotowe")) * { * //System.Windows.MessageBox.Show(WSB_separatedContent[1]); * WSB_Slowa_kluczowe_j_pl = WSB_separatedContent[1].Split(separators, StringSplitOptions.RemoveEmptyEntries); * WSB_Slowa_kluczowe_j_pl_line = WSB_separatedContent[1]; * } * * else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("angielskie hasla") | WSB_separatedContent[0].Contains("Angielskie hasla") | WSB_separatedContent[0] == "Angielskie hasla przedmiotowe")) * { * //System.Windows.MessageBox.Show(WSB_separatedContent[1]); * WSB_Slowa_kluczowe_j_ang = 
WSB_separatedContent[1].Split(separators, StringSplitOptions.RemoveEmptyEntries); * WSB_Slowa_kluczowe_j_ang_line = WSB_separatedContent[1]; * } * * else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul calosci") | WSB_separatedContent[0].Contains("Tytul calosci") | WSB_separatedContent[0] == "Tytul calosci")) * { * //System.Windows.MessageBox.Show(WSB_separatedContent[1]); * WSB_Tytul_calosci = WSB_separatedContent[1]; * } * * else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("doi") | WSB_separatedContent[0].Contains("DOI") | WSB_separatedContent[0] == "DOI")) * { * //System.Windows.MessageBox.Show(WSB_separatedContent[1]); * WSB_DOI = WSB_separatedContent[1]; * } * * else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul pracy w innym") | WSB_separatedContent[0].Contains("Tytul pracy w innym") | WSB_separatedContent[0] == "Tytul pracy w innym jezyku")) * { * //System.Windows.MessageBox.Show(WSB_separatedContent[1]); * WSB_Tytul_pracy_w_innym_j = WSB_separatedContent[1]; * } * * else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("szczegoly") | WSB_separatedContent[0].Contains("Szczegoly") | WSB_separatedContent[0] == "Szczegoly")) * { * //System.Windows.MessageBox.Show(WSB_separatedContent[1]); * WSB_Szczegoly = WSB_separatedContent[1]; * } * * else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("url") | WSB_separatedContent[0].Contains("Url") | WSB_separatedContent[0] == "Adres url")) * { * //System.Windows.MessageBox.Show(WSB_separatedContent[1]); * WSB_URL = WSB_separatedContent[1]; * } * * //else if (PP_separatedContent.Length == 1 & PP_separatedContent[0] == String.Empty) System.Windows.MessageBox.Show("The empty line detected", "Empty line", System.Windows.MessageBoxButton.OK); * //else System.Windows.MessageBox.Show("Error! 
Content not found!", "Error!", System.Windows.MessageBoxButton.OK); * } * } */ #endregion } }
/// <summary>
/// Parses the UMK text held in <c>endText</c> line by line. Every line is split into at most two
/// parts on <c>line_separator</c>. A two-part line is treated as a "label / value" pair whose value
/// is buffered in the matching <c>UMK_*</c> static field; a one-part line that contains an article
/// marker ("1.", "2.", ...) flushes the buffered fields to the database as one article, provided
/// both an author line and a title have been collected.
/// </summary>
public static void get_UMK_Document_content()
{
    UMK_articles_Count = 0;

    // Until the "Liczba odnalezionych" line supplies the record count, the only marker is the
    // empty string, which every line contains.
    string[] articleMarkers = { String.Empty };

    using (StringReader sr = new StringReader(endText))
    {
        string UMK_line;

        // Read until the reader is exhausted. The previous counted loop indexed a scratch array
        // with i <= Length (one past its end) and the array was never read, so both are gone.
        while ((UMK_line = sr.ReadLine()) != null)
        {
            string[] UMK_separatedContent = UMK_line.Split(line_separator, 2);
            string label = UMK_separatedContent[0];

            if (UMK_separatedContent.Length == 1)
            {
                if (label == "")
                {
                    continue;
                }

                // NOTE(review): a URL line that also contains a marker such as "1." is consumed
                // by this branch and never reaches the URL test below - confirm the intended order.
                if (articleMarkers.Any(marker => label.Contains(marker)))
                {
                    if (UMK_author_line != null && UMK_Tytul != null)
                    {
                        SaveUmkArticle();
                    }
                    continue;
                }

                if (UmkLabelContains(label, "http://", "https://"))
                {
                    UMK_Adres_URL = label;
                }
                continue;
            }

            string value = UMK_separatedContent[1];

            if (UmkLabelContains(label, "aut."))
            {
                UMK_autors = value.Split(separators, StringSplitOptions.RemoveEmptyEntries);
                UMK_author_line = value;
            }
            else if (UmkLabelContains(label, "zapyt"))
            {
                // NOTE(review): the value is concatenated into the SQL text unescaped and unquoted.
                // This string must not be executed as-is; use a parameterized query at the call site.
                UMK_Zapytanie_Wyszukiwania = "SELECT * FROM UMK_Splendor_Expertus_article_database WHERE article LIKE " + value;
            }
            else if (label.Contains("Liczba odnalezionych"))
            {
                UMK_articles_Count = Convert.ToInt32(value);
                articleMarkers = new string[UMK_articles_Count];
                for (int z = 0; z <= UMK_articles_Count - 1; z++)
                {
                    articleMarkers[z] = (z + 1) + ".";
                }
            }
            // The three specific title labels must be tested before the generic "tytu" label:
            // every one of them contains "tytu", so in the old order they were unreachable and
            // their values overwrote the article title.
            else if (UmkLabelContains(label, "tytuł wydawn. zbior.", "tytul wydawn. zbior."))
            {
                UMK_Tytul_Wydawn_Zbior = value;
            }
            else if (UmkLabelContains(label, "pełny tytuł czasop.", "pelny tytul czasop."))
            {
                UMK_Pelny_tytul_czasop = value;
            }
            else if (UmkLabelContains(label, "tytuł równoległy", "tytul rownolegly"))
            {
                UMK_Tytul_rownolegly = value;
            }
            else if (UmkLabelContains(label, "tytu"))
            {
                UMK_Tytul = value;
            }
            else if (UmkLabelContains(label, "opis wydawn"))
            {
                UMK_Opis_wydawn = value;
            }
            else if (UmkLabelContains(label, "język", "jezyk"))
            {
                UMK_Jezyk_Publikacji = value;
            }
            else if (UmkLabelContains(label, "polskie słowa kluczowe", "polskie slo"))
            {
                UMK_Slowa_kluczowe_j_pl = value.Split(separators);
                UMK_pl_keywords_line = value;
            }
            else if (UmkLabelContains(label, "angielskie słowa kluczowe", "angielskie slowa kluczowe"))
            {
                UMK_Slowa_kluczowe_j_ang = value.Split(separators);
                UMK_en_keywords_line = value;
            }
        }
    }

    // NOTE(review): fields buffered for the last article stay in the static fields when the text
    // ends without another marker line - confirm whether a final flush is wanted here.
}

/// <summary>
/// Returns true when <paramref name="label"/> contains any of <paramref name="fragments"/>,
/// compared ordinally and ignoring case.
/// </summary>
private static bool UmkLabelContains(string label, params string[] fragments)
{
    foreach (string fragment in fragments)
    {
        if (label.IndexOf(fragment, StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return true;
        }
    }
    return false;
}

/// <summary>
/// Appends the terms that <c>TextPreparing.TermsPrepataions</c> produces for
/// <paramref name="text"/> to <paramref name="document"/>; null or blank text is skipped.
/// </summary>
private static void AppendUmkTerms(StringBuilder document, string text)
{
    if (!string.IsNullOrWhiteSpace(text))
    {
        document.Append(TextPreparing.TermsPrepataions(text));
    }
}

/// <summary>
/// Stores the buffered <c>UMK_*</c> fields as one article together with its authors and
/// vocabulary terms, resetting each buffered text field to null as it is consumed.
/// Missing optional fields are stored as "Not_defined". The caller guarantees that
/// <c>UMK_author_line</c> and <c>UMK_Tytul</c> are not null.
/// </summary>
private static void SaveUmkArticle()
{
    using (var dbContext = new ArticleDBDataModelContainer())
    {
        var document = new StringBuilder();
        var umk_article = dbContext.UMK_ArticlesSet.Create();

        umk_article.article_author_line = UMK_author_line;
        UMK_author_line = null;

        umk_article.article_title = UMK_Tytul;
        AppendUmkTerms(document, UMK_Tytul);
        UMK_Tytul = null;

        // Terms are taken from the real value only, so the "Not_defined" placeholder is stored
        // on the article but never ends up in the vocabulary.
        umk_article.article_Full_title = UMK_Pelny_tytul_czasop ?? "Not_defined";
        AppendUmkTerms(document, UMK_Pelny_tytul_czasop);
        UMK_Pelny_tytul_czasop = null;

        umk_article.article_language = UMK_Jezyk_Publikacji ?? "Not_defined";
        UMK_Jezyk_Publikacji = null;

        umk_article.article_translated_title = UMK_Tytul_rownolegly ?? "Not_defined";
        AppendUmkTerms(document, UMK_Tytul_rownolegly);
        UMK_Tytul_rownolegly = null;

        umk_article.article_eng_keywords = UMK_en_keywords_line ?? "Not_defined";
        AppendUmkTerms(document, UMK_en_keywords_line);
        UMK_en_keywords_line = null;

        umk_article.article_pl_keywords = UMK_pl_keywords_line ?? "Not_defined";
        AppendUmkTerms(document, UMK_pl_keywords_line);
        UMK_pl_keywords_line = null;

        umk_article.article_url = UMK_Adres_URL ?? "Not_defined";
        UMK_Adres_URL = null;

        umk_article.article_publisher_title = UMK_Tytul_Wydawn_Zbior ?? "Not_defined";
        UMK_Tytul_Wydawn_Zbior = null;

        umk_article.article_publisher_desc = UMK_Opis_wydawn ?? "Not_defined";
        UMK_Opis_wydawn = null;

        // The author tokens are consumed in pairs: [k] is stored as the name, [k + 1] as the surname.
        for (int k = 0; k <= UMK_autors.Length - 2; k += 2)
        {
            var authors_of_the_article = dbContext.AuthorSet.Create();
            authors_of_the_article.author_name = UMK_autors[k];
            authors_of_the_article.author_surename = UMK_autors[k + 1];
            umk_article.Author.Add(authors_of_the_article);
        }

        dbContext.UMK_ArticlesSet.Add(umk_article);

        // NOTE(review): the previous code re-read Allowed_term_dictionary.csv for every token and
        // called RemoveAt on a temporary copy of the token list, so no token was ever filtered.
        // That no-op was removed; if dictionary-based filtering is wanted it has to be specified.
        // TODO: record the id of the document in which each term occurs.
        string[] tokens = document.ToString().Split(' ', ';', ':', ',');
        foreach (string token in tokens)
        {
            // The old guard ("!= empty OR != space OR != null ...") was always true, so blank
            // tokens were stored as terms.
            if (string.IsNullOrWhiteSpace(token))
            {
                continue;
            }

            var term = dbContext.Terms_Vocabulary.Create();
            term.term_value = token;
            umk_article.Terms_Vocabulary.Add(term);
        }

        try
        {
            dbContext.SaveChanges();
        }
        catch (Exception ex)
        {
            File.WriteAllText(@"F:\\Magistry files\UMK_crawler_Log.txt", ex.ToString());
        }
    }
}
/// <summary>
/// Processes every file found in <c>filePathBibtex</c>: reads the leading lines of the file into
/// the static BibTeX fields (<c>_title</c>, <c>_abstract</c>, <c>_keywords</c>, ...), then stores
/// them as one article with its authors and vocabulary terms. A failure while storing one file is
/// written to the crawler log and processing continues with the next file.
/// </summary>
public static void LoadBibtexFile()
{
    string[] fileEntries = Directory.GetFiles(filePathBibtex);

    foreach (string file in fileEntries)
    {
        ReadBibtexFields(file);

        try
        {
            SaveBibtexArticle();
            Console.WriteLine("End of file! Go to the next ->");
        }
        catch (Exception ex)
        {
            File.WriteAllText(@"F:\\Magistry files\PG_crawler_Log.txt", ex.ToString());
        }
    }
}

/// <summary>
/// Reads at most the first 14 lines of <paramref name="file"/> into <c>context</c>, splits each
/// line into a key and a value on <c>separators</c> and hands the pair to
/// <see cref="ApplyBibtexField"/>.
/// </summary>
private static void ReadBibtexFields(string file)
{
    // NOTE(review): only the first 14 lines are inspected, as before - confirm that every
    // entry fits into that window.
    const int maxLines = 14;

    using (StreamReader reader = new StreamReader(File.OpenRead(file)))
    {
        context = new string[maxLines];
        separatedContext = new string[2];

        for (int i = 0; i < maxLines; i++)
        {
            context[i] = reader.ReadLine();
            if (context[i] == null)
            {
                // End of file: every further read would return null as well.
                break;
            }

            Console.WriteLine("Processing " + i.ToString() + " line.");
            context[i] = context[i].TrimStart(' ').Replace('\"', ' ').Replace('\\', ' ').TrimEnd(',');
            separatedContext = context[i].Split(separators, 2, StringSplitOptions.RemoveEmptyEntries);

            // Lines without both a key and a value used to be skipped by catching the resulting
            // IndexOutOfRangeException; the length is now checked explicitly.
            if (separatedContext.Length < 2)
            {
                continue;
            }

            ApplyBibtexField(separatedContext[0], separatedContext[1]);
        }
    }
}

/// <summary>
/// Stores <paramref name="value"/> in the static field selected by the first keyword that
/// <paramref name="key"/> contains; unknown keys are ignored.
/// </summary>
private static void ApplyBibtexField(string key, string value)
{
    if (key.Contains("title"))
    {
        if (value.Length >= 2)
        {
            _title = value;
        }
    }
    else if (key.Contains("abstract"))
    {
        // NOTE(review): the previous character-stripping loop here discarded the result of
        // string.Remove and therefore never changed the abstract; it was removed.
        if (value.Length >= 5)
        {
            _abstract = value;
        }
    }
    else if (key.Contains("keywords"))
    {
        // The old guard ("!= empty OR != space") was always true.
        if (!string.IsNullOrWhiteSpace(value))
        {
            _keywords = value;
        }
    }
    else if (key.Contains("year"))
    {
        // A non-numeric year leaves _year untouched, as the swallowed FormatException did before.
        int parsedYear;
        if (int.TryParse(value, out parsedYear))
        {
            _year = parsedYear;
        }
    }
    else if (key.Contains("country"))
    {
        _country = value;
    }
    else if (key.Contains("author"))
    {
        _authorsLine = value;
        _authors = value.Split(authorSeparator, StringSplitOptions.RemoveEmptyEntries);
    }
    else if (key.Contains("organization"))
    {
        _organization = value;
    }
    else if (key.Contains("url"))
    {
        _url = value;
    }
}

/// <summary>
/// Appends the terms that <c>TextPreparing.TermsPrepataions</c> produces for
/// <paramref name="text"/> to <paramref name="document"/>; null or blank text is skipped.
/// </summary>
private static void AppendBibtexTerms(StringBuilder document, string text)
{
    if (!string.IsNullOrWhiteSpace(text))
    {
        document.Append(TextPreparing.TermsPrepataions(text));
    }
}

/// <summary>
/// Stores the buffered BibTeX fields as one article with its authors and vocabulary terms and
/// saves it. The buffered text fields and the author list are reset to null as they are consumed.
/// Exceptions are not handled here; they propagate to the caller.
/// </summary>
private static void SaveBibtexArticle()
{
    using (var dbContext = new ArticleDBDataModelContainer())
    {
        var document = new StringBuilder();
        var bibtexArticle = dbContext.PG_ArticlesSet.Create();

        // The old guards ("!= empty OR != space OR != null") were always true, so a missing
        // field was passed to TermsPrepataions as null.
        bibtexArticle.title = _title;
        AppendBibtexTerms(document, _title);
        _title = null;

        bibtexArticle.abstractText = _abstract;
        AppendBibtexTerms(document, _abstract);
        _abstract = null;

        bibtexArticle.keywords = _keywords;
        AppendBibtexTerms(document, _keywords);
        _keywords = null;

        // NOTE(review): _year is not reset between files, so a file without a year line keeps
        // the value of the previous file - confirm whether that is intended.
        bibtexArticle.year = _year;

        bibtexArticle.country = _country;
        _country = null;

        bibtexArticle.authors = _authorsLine;
        _authorsLine = null;

        bibtexArticle.organizations = _organization;
        _organization = null;

        bibtexArticle.url = _url;
        _url = null;

        // The author tokens are consumed in pairs: [i] is stored as the name, [i + 1] as the surname.
        if (_authors != null)
        {
            for (int i = 0; i <= _authors.Length - 2; i += 2)
            {
                var authors_of_the_article = dbContext.AuthorSet.Create();
                authors_of_the_article.author_name = _authors[i];
                authors_of_the_article.author_surename = _authors[i + 1];
                bibtexArticle.Author.Add(authors_of_the_article);
            }
        }
        // Reset like the other buffered fields so that a file without an author line does not
        // inherit the authors of the previous file.
        _authors = null;

        dbContext.PG_ArticlesSet.Add(bibtexArticle);

        // Blank tokens and tokens of three characters or fewer are dropped. The previous code
        // removed them from a list while enumerating it, which throws InvalidOperationException
        // and prevented the article from being saved.
        // NOTE(review): the previous attempt to strip digits and punctuation from tokens discarded
        // the result of string.Remove (and could throw on short tokens); it was removed rather
        // than guessed at - its character list also contained the letter 'x'.
        // NOTE(review): terms are not de-duplicated against the existing vocabulary.
        string[] tokens = document.ToString().Split(' ', ';', ':', ',');
        foreach (string token in tokens)
        {
            if (string.IsNullOrWhiteSpace(token) || token.Length <= 3)
            {
                continue;
            }

            var term = dbContext.Terms_Vocabulary.Create();
            term.term_value = token;
            bibtexArticle.Terms_Vocabulary.Add(term);
        }

        dbContext.SaveChanges();
    }
}