/// <summary>
        /// Reads <c>endText</c> line by line, splits every line into at most two parts on
        /// <c>line_separator</c>, and collects the recognised fields (authors, title,
        /// source, English keywords, DOI) into the static <c>UG_*</c> fields.
        /// A single-part line that contains one of the record markers ("1.", "2.", ...)
        /// flushes the fields collected so far to the database as one UG article.
        /// </summary>
        /// <remarks>
        /// The record markers are built from the "Liczba odnalezionych" line. Until that
        /// line is seen, the marker list holds one empty string, which every line
        /// contains, so any non-empty single-part line triggers a flush attempt.
        /// NOTE(review): fields still pending when the text ends are not flushed here;
        /// confirm whether the input always ends with a marker line.
        /// </remarks>
        public static void get_UG_Document_content()
        {
            UG_articles_Count = 0;
            string[] UG_articles_Matrix = { String.Empty };

            using (StringReader sr = new StringReader(endText))
            {
                string UG_line;

                while ((UG_line = sr.ReadLine()) != null)
                {
                    string[] UG_separatedContent = UG_line.Split(line_separator, 2);

                    if (UG_separatedContent.Length == 1)
                    {
                        string UG_marker = UG_separatedContent[0];

                        // Blank lines carry no information.
                        if (UG_marker == "")
                        {
                            continue;
                        }

                        // A record marker closes the article whose fields were collected so far.
                        if (UG_articles_Matrix.Any(x => UG_marker.Contains(x)))
                        {
                            if (UG_author_line != null && UG_Tytul != null)
                            {
                                UG_SaveCollectedArticle();
                            }
                            else
                            {
                                UG_AppendToCrawlerLog("Empty line detected.");
                            }
                        }
                        continue;
                    }

                    if (UG_separatedContent.Length != 2)
                    {
                        continue;
                    }

                    string UG_header = UG_separatedContent[0];
                    string UG_value  = UG_separatedContent[1];

                    // Header matching is ordinal and case-insensitive so that it does not
                    // depend on the current culture's lower-casing rules.
                    if (UG_header.Contains("Liczba odnalezionych"))
                    {
                        UG_articles_Count  = Convert.ToInt32(UG_value);
                        UG_articles_Matrix = new string[UG_articles_Count];
                        for (int z = 0; z < UG_articles_Count; z++)
                        {
                            UG_articles_Matrix[z] = (z + 1) + ".";
                        }
                    }
                    else if (UG_header.IndexOf("autorzy", StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        UG_author_line = UG_value;
                        UG_autors      = UG_value.Split(autor_separators, StringSplitOptions.RemoveEmptyEntries);
                    }
                    else if (UG_header.IndexOf("tytu", StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        // "tytu" also covers "tytul", "TYTUL" and "TYTUL[ROZDZIALU, FRAGMENTU]".
                        UG_Tytul = UG_value;
                    }
                    else if (UG_header.IndexOf("zrodlo", StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        UG_Zrodlo = UG_value;
                    }
                    else if (UG_header.Contains("Slowa kluczowe w j. ang."))
                    {
                        UG_Slowa_kluczowe_j_ang      = UG_value.Split(separators);
                        UG_slowa_kluczowe_j_ang_line = UG_value;
                    }
                    else if (UG_header.IndexOf("doi", StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        UG_DOI = UG_value;
                    }
                }
            }
        }

        /// <summary>
        /// Persists the article described by the static <c>UG_*</c> fields together with
        /// its authors and terms, and resets the consumed fields to <c>null</c>.
        /// A failure of <c>SaveChanges</c> is written to the crawler log instead of being
        /// rethrown; a failure to read the allowed-term dictionary propagates to the caller.
        /// </summary>
        private static void UG_SaveCollectedArticle()
        {
            const string allowedDictionaryPath = @"F:\Magistry files\csv_files\Allowed_term_dictionary.csv";

            using (var dbContext = new ArticleDBDataModelContainer())
            {
                var document   = new StringBuilder();
                var ug_article = dbContext.UG_ArticlesSet.Create();

                ug_article.article_author_line = UG_author_line;
                UG_author_line = null;

                ug_article.article_keywords = UG_slowa_kluczowe_j_ang_line;
                if (!string.IsNullOrWhiteSpace(UG_slowa_kluczowe_j_ang_line))
                {
                    var termEngKeywords = TextPreparing.TermsPrepataions(UG_slowa_kluczowe_j_ang_line);
                    document.Append(termEngKeywords);
                    // Keep the last keyword term from fusing with the first title term.
                    document.Append(' ');
                }
                UG_slowa_kluczowe_j_ang_line = null;

                ug_article.article_source = UG_Zrodlo;
                UG_Zrodlo = null;

                ug_article.article_title = UG_Tytul;
                if (!string.IsNullOrWhiteSpace(UG_Tytul))
                {
                    var term_UG_Title = TextPreparing.TermsPrepataions(UG_Tytul);
                    document.Append(term_UG_Title);
                }
                UG_Tytul = null;

                ug_article.article_DOI = UG_DOI;
                UG_DOI = null;

                // UG_autors is consumed as consecutive (name, surname) pairs;
                // a trailing unpaired entry is ignored.
                if (UG_autors != null)
                {
                    for (int k = 0; k <= UG_autors.Length - 2; k += 2)
                    {
                        var authors_of_the_article = dbContext.AuthorSet.Create();
                        authors_of_the_article.author_name     = UG_autors[k];
                        authors_of_the_article.author_surename = UG_autors[k + 1];
                        ug_article.Author.Add(authors_of_the_article);
                    }
                }

                dbContext.UG_ArticlesSet.Add(ug_article);

                // The dictionary is loaded once per article rather than once per term.
                // Entries are trimmed because the file is split on '\n' only.
                string[] allowed_dictionary = File.ReadAllText(allowedDictionaryPath)
                    .Split(',', '\n')
                    .Select(entry => entry.Trim())
                    .Where(entry => entry.Length > 0)
                    .ToArray();

                // Blank tokens are dropped. Tokens of up to three characters are kept only
                // when they contain an allowed dictionary entry; longer tokens are always kept.
                // NOTE(review): the previous filter removed nothing, so the short-token rule
                // is reconstructed from its conditions - confirm it matches the intent.
                string[] _document = document.ToString()
                    .Split(' ', ';', ':', ',')
                    .Where(token => !string.IsNullOrWhiteSpace(token))
                    .Where(token => token.Length > 3 || allowed_dictionary.Any(entry => token.Contains(entry)))
                    .ToArray();

                foreach (string token in _document)
                {
                    // TODO: the id of the document in which the given word occurs
                    // needs to be recorded here.
                    var terms = dbContext.Terms_Vocabulary.Create();
                    terms.term_value = token;
                    try
                    {
                        ug_article.Terms_Vocabulary.Add(terms);
                    }
                    catch (Exception addingTermToDB)
                    {
                        UG_AppendToCrawlerLog(DateTime.Now.ToString() + " " + addingTermToDB.ToString());
                    }
                }

                try
                {
                    dbContext.SaveChanges();
                }
                catch (Exception ex)
                {
                    UG_AppendToCrawlerLog(DateTime.Now.ToString() + " " + ex.ToString());
                }
            }
        }

        /// <summary>
        /// Appends one line to the UG crawler log file. Appending keeps earlier
        /// entries instead of overwriting the file on every message.
        /// </summary>
        /// <param name="message">Text to write; a line break is added after it.</param>
        private static void UG_AppendToCrawlerLog(string message)
        {
            File.AppendAllText(@"F:\\Magistry files\UG_crawler_Log.txt", message + Environment.NewLine);
        }
Пример #2
0
        public static void get_PP_Document_content()
        {
            string[] PP_newcontent       = new string[hapDoc.DocumentNode.InnerText.Length];
            string[] PP_separatedContent = new string[hapDoc.DocumentNode.InnerText.Length];

            PP_articles_Count = 0;
            string[] PP_articles_Matrix = { String.Empty };

            using (StringReader sr = new StringReader(endText))
            {
                int    p = 0;
                string PP_line;

                while ((PP_line = sr.ReadLine()) != null)
                {
                    PP_newcontent[p]    = PP_line;
                    PP_separatedContent = PP_line.Split(line_separator, 2);

                    if (PP_separatedContent.Length == 1 & PP_separatedContent[0] == "")
                    {
                        continue;
                    }
                    else if (PP_separatedContent.Length == 1 && PP_articles_Matrix.Any(x => PP_separatedContent[0].Contains(x)))
                    {
                        if (PP_author_line != null && PP_Tytul != null)
                        {
                            try
                            {
                                using (var PPdbContext = new ArticleDBDataModelContainer())
                                {
                                    var document   = new StringBuilder();
                                    var pp_article = PPdbContext.PP_ArticlesSet.Create();

                                    pp_article.article_author_line = PP_author_line;
                                    PP_author_line = null;

                                    pp_article.article_title = PP_Tytul;
                                    if (PP_Tytul != String.Empty || PP_Tytul != " " || PP_Tytul != null)
                                    {
                                        var termTitlePP = TextPreparing.TermsPrepataions(PP_Tytul);
                                        document.Append(termTitlePP);
                                    }
                                    PP_Tytul = null;

                                    pp_article.article_source = PP_Zrodlo;
                                    if (PP_Zrodlo != String.Empty || PP_Zrodlo != " " || PP_Zrodlo != null)
                                    {
                                        var termSourcePP = TextPreparing.TermsPrepataions(PP_Zrodlo);
                                        document.Append(termSourcePP);
                                    }
                                    else
                                    {
                                        PP_Zrodlo = "Not defined";
                                        document.Append(PP_Zrodlo);
                                    }
                                    PP_Zrodlo = null;

                                    pp_article.article_year = PP_Rok;
                                    PP_Rok = 0;
                                    pp_article.article_language = PP_Jezyk_Publikacji;
                                    PP_Jezyk_Publikacji         = null;
                                    pp_article.article_DOI      = PP_DOI;
                                    PP_DOI = null;

                                    /*
                                     * pp_article.article_details = PP_Uwagi;
                                     * PP_Uwagi = null;
                                     * pp_article.article_URL = PP_Adres_URL;
                                     * PP_Adres_URL = null;
                                     */

                                    for (int z = 0; z <= PP_autors.Length - 4;)
                                    {
                                        var authors_of_the_PP_article = PPdbContext.AuthorSet.Create();
                                        if (PP_autors[z] != "IC)")
                                        {
                                            authors_of_the_PP_article.author_name     = PP_autors[z + 1];
                                            authors_of_the_PP_article.author_surename = PP_autors[z];
                                            pp_article.Author.Add(authors_of_the_PP_article);
                                        }
                                        z += 4;
                                    }
                                    PPdbContext.PP_ArticlesSet.Add(pp_article);

                                    var _document = document.ToString().Split(' ', ';', ':', ',');
                                    for (int k = 0; k <= _document.Length - 1; k++)
                                    {
                                        var terms = PPdbContext.Terms_Vocabulary.Create();

                                        string   dictionary_text    = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv");
                                        string[] allowed_dictionary = dictionary_text.Split(',', '\n');

                                        for (int d = 0; d <= _document.Length - 1; d++)
                                        {
                                            for (int j = 0; j <= allowed_dictionary.Length - 1; j++)
                                            {
                                                if (_document[d].Length > 3 && _document[d].Contains(allowed_dictionary[j]))
                                                {
                                                    continue;
                                                }
                                                else if (_document[d].Length <= 3 && !(_document[d].Contains(allowed_dictionary[j])))
                                                {
                                                    _document.ToList().RemoveAt(d);
                                                }
                                            }
                                        }
                                        //tutaj potrzebnie przepisac id dokumenta w ktorym wystepuje dane slowo
                                        if (_document[k] != String.Empty || _document[k] != " " || _document[k] != null || _document[k] != Char.IsDigit(' ').ToString())
                                        {
                                            //dbContext.Terms_Vocabulary.Where(u)
                                            var termVocabularyTable = PPdbContext.Terms_Vocabulary;
                                            terms.term_value = _document[k];
                                        }
                                        pp_article.Terms_Vocabulary.Add(terms);
                                    }
                                    PPdbContext.SaveChanges();
                                }
                            }
                            catch (Exception ex)
                            {
                                File.WriteAllText(@"F:\\Magistry files\PP_crawler_Log.txt", ex.ToString());
                            }
                        }
                        else
                        {
                            File.WriteAllText(@"F:\\Magistry files\PP_crawler_Log.txt", "Empty line detected." + '\n');
                        }
                    }
                    else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Liczba odnalezionych") || PP_separatedContent[0] == "Liczba odnalezionych rekordow"))
                    {
                        PP_articles_Count  = Convert.ToInt32(PP_separatedContent[1]);
                        PP_articles_Matrix = new string[PP_articles_Count];
                        for (int l = 0; l <= PP_articles_Count - 1; l++)
                        {
                            PP_articles_Matrix[l] = (l + 1) + ".";
                        }
                    }
                    if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].ToLower().Contains("autor") || PP_separatedContent[0].Contains("Autor") || PP_separatedContent[0] == "Autor"))
                    {
                        PP_author_line = PP_separatedContent[1];
                        var PP_author_line_modified = PP_author_line.Replace("(", String.Empty);
                        PP_autors = PP_separatedContent[1].Split(autor_separators, StringSplitOptions.RemoveEmptyEntries);
                    }
                    else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].ToLower().Contains("tytu") || PP_separatedContent[0].ToLower().Contains("tytul") || PP_separatedContent[0].Contains("Tytul")))
                    {
                        PP_Tytul = PP_separatedContent[1];
                    }
                    else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Zrodlo") || PP_separatedContent[0].ToLower().Contains("zrodlo")))
                    {
                        PP_Zrodlo = PP_separatedContent[1];
                    }
                    else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Rok") || PP_separatedContent[0].ToLower().Contains("rok")))
                    {
                        string rok = "";
                        if (PP_separatedContent[1] != "" | PP_separatedContent[1] == String.Empty)
                        {
                            rok = null;
                        }
                        else
                        {
                            rok = PP_separatedContent[1].Substring(0, 5);
                        }

                        PP_Rok = Convert.ToInt32(rok);
                    }
                    else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Jezyk publikacji") || PP_separatedContent[0].ToLower().Contains("jezyk publikacji") || PP_separatedContent[0].Contains("Język publikacji") || PP_separatedContent[0].ToLower().Contains("język publikacji")))
                    {
                        PP_Jezyk_Publikacji = PP_separatedContent[1];
                    }
                    else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("DOI") || PP_separatedContent[0].ToLower().Contains("doi") || PP_separatedContent[0] == "DOI"))
                    {
                        PP_DOI = PP_separatedContent[1];
                    }
                    p++;
                }
                #region Old_code

                /* 22.08.2018 - old version
                 * for (int i = 0; i <= hapDoc.DocumentNode.InnerText.Length; i++)
                 * {
                 *  PP_line = sr.ReadLine();
                 *  int counter = 0;
                 *  if (PP_line != null)
                 *  {
                 *      PP_newcontent[i] = PP_line;
                 *      PP_separatedContent = PP_line.Split(line_separator,2);
                 *
                 *
                 *      if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].ToLower().Contains("autor") || PP_separatedContent[0].Contains("Autor") || PP_separatedContent[0] == "Autor"))
                 *      {
                 *          //System.Windows.MessageBox.Show(PP_separatedContent[1]);
                 *          PP_author_line = PP_separatedContent[1];
                 *          var PP_author_line_modified = PP_author_line.Replace("(", String.Empty);
                 *
                 *          PP_autors = PP_separatedContent[1].Split(autor_separators, StringSplitOptions.RemoveEmptyEntries);
                 *
                 *      }
                 *      else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Liczba odnalezionych") || PP_separatedContent[0] == "Liczba odnalezionych rekordow"))
                 *      {
                 *          PP_articles_Count = Convert.ToInt32(PP_separatedContent[1]);
                 *          PP_articles_Matrix = new string[PP_articles_Count];
                 *          for (int l = 0; l <= PP_articles_Count - 1; l++)
                 *          {
                 *              PP_articles_Matrix[l] = (l + 1) + ".";
                 *          }
                 *      }
                 *      else if (PP_separatedContent.Length == 1 && PP_articles_Matrix.Any(x => PP_separatedContent[0].Contains(x)))
                 *      {
                 *          if (PP_author_line != null && PP_Tytul != null)
                 *          {
                 *              ///<summary>
                 *              ///PPArticle_Entity_Object_creation_Model_first
                 *              /// </summary>
                 *              try
                 *              {
                 #region PP_Article_Object_creation_Model_First
                 *                  using (var PPdbContext = new ArticleDBDataModelContainer())
                 *                  {
                 *                      var document = new StringBuilder();
                 *                      var pp_article = PPdbContext.PP_ArticlesSet.Create();
                 *
                 *                      pp_article.article_author_line = PP_author_line;
                 *                      PP_author_line = null;
                 *
                 *                      pp_article.article_title = PP_Tytul;
                 *                      if (PP_Tytul != String.Empty || PP_Tytul != " " || PP_Tytul != null)
                 *                      {
                 *                          var termTitlePP = TextPreparing.TermsPrepataions(PP_Tytul);
                 *                          document.Append(termTitlePP);
                 *                      }
                 *                      PP_Tytul = null;
                 *
                 *                      pp_article.article_source = PP_Zrodlo;
                 *                      if (PP_Zrodlo != String.Empty || PP_Zrodlo != " " || PP_Zrodlo != null)
                 *                      {
                 *                          var termSourcePP = TextPreparing.TermsPrepataions(PP_Zrodlo);
                 *                          document.Append(termSourcePP);
                 *                      }
                 *                      PP_Zrodlo = null;
                 *
                 *                      pp_article.article_year = PP_Rok;
                 *                      PP_Rok = 0;
                 *                      pp_article.article_language = PP_Jezyk_Publikacji;
                 *                      PP_Jezyk_Publikacji = null;
                 *                      pp_article.article_DOI = PP_DOI;
                 *                      PP_DOI = null;
                 *                      //
                 *                      pp_article.article_details = PP_Uwagi;
                 *                      PP_Uwagi = null;
                 *                      pp_article.article_URL = PP_Adres_URL;
                 *                      PP_Adres_URL = null;
                 *                      //
                 *
                 * for (int z = 0; z <= PP_autors.Length - 4;)
                 *                      {
                 *                          var authors_of_the_PP_article = PPdbContext.AuthorSet.Create();
                 *                          if (PP_autors[z] != "IC)")
                 *                          {
                 *                              authors_of_the_PP_article.author_name = PP_autors[z + 1];
                 *                              authors_of_the_PP_article.author_surename = PP_autors[z];
                 *                              pp_article.Author.Add(authors_of_the_PP_article);
                 *                          }
                 *                          z += 4;
                 *                      }
                 *                      PPdbContext.PP_ArticlesSet.Add(pp_article);
                 *
                 *                      var _document = document.ToString().Split(' ', ';', ':', ',');
                 *                      for (int k = 0; k <= _document.Length - 1; k++)
                 *                      {
                 *                          var terms = PPdbContext.Terms_Vocabulary.Create();
                 *
                 *                          //
                 *                          string dictionary_text = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv");
                 *                          string[] allowed_dictionary = dictionary_text.Split(',', '\n');
                 *
                 *                          for (int p = 0; p <= _document.Length - 1; p++)
                 *                          {
                 *                              for (int j = 0; j <= allowed_dictionary.Length - 1; j++)
                 *                              {
                 *                                  if (_document[p].Length > 3 && _document[p].Contains(allowed_dictionary[j]))
                 *                                  {
                 *                                      continue;
                 *                                  }
                 *                                  else if (_document[p].Length <= 3 && !(_document[p].Contains(allowed_dictionary[j])))
                 *                                  {
                 *                                      _document.ToList().RemoveAt(p);
                 *                                  }
                 *
                 *                              }
                 *                          }
                 *                          //tutaj potrzebnie przepisac id dokumenta w ktorym wystepuje dane slowo
                 *                          if (_document[k] != String.Empty || _document[k] != " " || _document[k] != null || _document[k] != Char.IsDigit(' ').ToString())
                 *                          {
                 *                              //dbContext.Terms_Vocabulary.Where(u)
                 *                              var termVocabularyTable = PPdbContext.Terms_Vocabulary;
                 *                              terms.term_value = _document[k];
                 *
                 *                          }
                 *                          pp_article.Terms_Vocabulary.Add(terms);
                 *                      }
                 *
                 *                      PPdbContext.SaveChanges();
                 *                  }
                 #endregion
                 *              }
                 *              catch (Exception ex)
                 *              {
                 *                  File.WriteAllText(@"F:\\Magistry files\PP_crawler_Log.txt", ex.ToString());
                 *              }
                 *              ///<summary>
                 *              /// PPArticle_Entity_Object_Creation
                 *              /// </summary>
                 #region PPArticle_Entity_Object_Creation
                 *              /*
                 *              using (var dbppcontext = new PublicationsContext())
                 *              {
                 *                  var pp_article = new PPArticle();
                 *                  pp_article.article_author_line = PP_author_line;
                 *                  PP_author_line = null;
                 *                  pp_article.article_title = PP_Tytul;
                 *                  PP_Tytul = null;
                 *                  pp_article.article_source = PP_Zrodlo;
                 *                  PP_Zrodlo = null;
                 *                  pp_article.article_year = PP_Rok;
                 *                  PP_Rok = 0;
                 *                  pp_article.article_language = PP_Jezyk_Publikacji;
                 *                  PP_Jezyk_Publikacji = null;
                 *                  pp_article.article_DOI = PP_DOI;
                 *                  PP_DOI = null;
                 *                  pp_article.article_details = PP_Uwagi;
                 *                  PP_Uwagi = null;
                 *                  pp_article.article_URL = PP_Adres_URL;
                 *                  PP_Adres_URL = null;
                 *
                 *
                 *
                 *                  var authors_of_the_article = new Authors();
                 *                  for (int k = 0; k <= PP_autors.Length - 2; k++)
                 *                  {
                 *                      authors_of_the_article.author_name = PP_autors[k];
                 *                      authors_of_the_article.author_surename = PP_autors[k + 1];
                 *                      dbppcontext.Authors.Add(authors_of_the_article);
                 *
                 *                  }
                 *                  //dbppcontext.PP_Articles.Add(pp_article);
                 *                  dbppcontext.PP_Articles.Attach(pp_article);
                 *                  dbppcontext.Entry(pp_article).State = System.Data.Entity.EntityState.Added;
                 *                  dbppcontext.SaveChanges();
                 *                  //dbppcontext.SaveChanges();
                 *              }
                 *              //
                 *              //#endregion
                 *          }
                 *          else
                 *          {
                 *              //System.Windows.MessageBox.Show("Brak danych");
                 *          }
                 *
                 *      }
                 *      else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].ToLower().Contains("tytu") || PP_separatedContent[0].ToLower().Contains("tytul") || PP_separatedContent[0].Contains("Tytul")))
                 *      {
                 *          PP_Tytul = PP_separatedContent[1];
                 *          //System.Windows.MessageBox.Show(PP_Tytul);
                 *      }
                 *      else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Zrodlo") || PP_separatedContent[0].ToLower().Contains("zrodlo")))
                 *      {
                 *          PP_Zrodlo = PP_separatedContent[1];
                 *          //System.Windows.MessageBox.Show(PP_Zrodlo);
                 *      }
                 *      else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Rok") || PP_separatedContent[0].ToLower().Contains("rok")))
                 *      {
                 *          var rok = PP_separatedContent[1].Substring(0, 5);
                 *          PP_Rok = Convert.ToInt32(rok);
                 *          //System.Windows.MessageBox.Show(PP_Rok.ToString());
                 *      }
                 *      else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Jezyk publikacji") || PP_separatedContent[0].ToLower().Contains("jezyk publikacji") || PP_separatedContent[0].Contains("Język publikacji") || PP_separatedContent[0].ToLower().Contains("język publikacji")))
                 *      {
                 *          PP_Jezyk_Publikacji = PP_separatedContent[1];
                 *          //System.Windows.MessageBox.Show(PP_Jezyk_Publikacji);
                 *      }
                 *      else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("DOI") || PP_separatedContent[0].ToLower().Contains("doi") || PP_separatedContent[0] == "DOI"))
                 *      {
                 *          PP_DOI = PP_separatedContent[1];
                 *          //System.Windows.MessageBox.Show(PP_DOI);
                 *      }
                 *      /*
                 *      else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Uwagi") || PP_separatedContent[0].ToLower().Contains("uwagi") || PP_separatedContent[0] == "Uwagi"))
                 *      {
                 *          PP_Uwagi = PP_separatedContent[1];
                 *          System.Windows.MessageBox.Show(PP_Uwagi);
                 *      }
                 *      else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Adres url") || PP_separatedContent[0].ToLower().Contains("adres url") || PP_separatedContent[0] == "Adres url"))
                 *      {
                 *          PP_Adres_URL = PP_separatedContent[1];
                 *          System.Windows.MessageBox.Show(PP_Adres_URL = PP_separatedContent[1]);
                 *      }
                 *      //
                 *
                 *      //else if (PP_separatedContent.Length == 1 && PP_separatedContent[0] == String.Empty) System.Windows.MessageBox.Show("The empty line detected", "Empty line", System.Windows.MessageBoxButton.OK);
                 *      else
                 *      {
                 *          //System.Windows.MessageBox.Show("Error! Content not found!", "Error!", System.Windows.MessageBoxButton.OK);
                 *
                 *      }
                 *      counter++;
                 *  }
                 * }
                 */
                #endregion
            }
        }
Пример #3
0
        // TODO: implement a divide-and-conquer approach for large files

        public static void get_WSB_Document_content()
        {
            string[] WSB_newcontent       = new string[hapDoc.DocumentNode.InnerText.Length];
            string[] WSB_separatedContent = new string[hapDoc.DocumentNode.InnerText.Length];

            WSB_articles_Count = 0;
            string[] WSB_articles_Matrix = { String.Empty };

            using (StringReader sr = new StringReader(endText))
            {
                int    p = 0;
                string WSB_line;
                // 22.08.2018 New version of reader
                while ((WSB_line = sr.ReadLine()) != null)
                {
                    WSB_newcontent[p]    = WSB_line;
                    WSB_separatedContent = WSB_line.Split(line_separator, 2);
                    if (WSB_separatedContent.Length == 1 & WSB_separatedContent[0] == "")
                    {
                        continue;
                    }
                    else if (WSB_separatedContent.Length == 1 & WSB_articles_Matrix.Any(x => WSB_separatedContent[0].Contains(x)))
                    {
                        if (WSB_author_line != null & WSB_Tytul_pracy != null)
                        {
                            using (var dbContext = new ArticleDBDataModelContainer())
                            {
                                var document    = new StringBuilder();
                                var wsb_article = dbContext.WSB_ArticlesSet.Create();

                                if (WSB_author_line == null)
                                {
                                    WSB_author_line = "Not_defined";
                                }
                                wsb_article.article_authors = WSB_author_line;
                                WSB_author_line             = null;

                                if (WSB_Tytul_pracy == null)
                                {
                                    WSB_Tytul_pracy = "Not_defined";
                                }
                                wsb_article.article_title = WSB_Tytul_pracy;
                                if (WSB_Tytul_pracy != String.Empty | WSB_Tytul_pracy != " " | WSB_Tytul_pracy != null)
                                {
                                    var termTitle_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_pracy);
                                    document.Append(termTitle_WSB);
                                }
                                WSB_Tytul_pracy = null;

                                if (WSB_Adres_wydawniczy == null)
                                {
                                    WSB_Adres_wydawniczy = "Not_defined";
                                }
                                wsb_article.article_publisher_adres = WSB_Adres_wydawniczy;
                                WSB_Adres_wydawniczy = null;

                                if (WSB_Tytul_calosci == null)
                                {
                                    WSB_Tytul_calosci = "Not_defined";
                                }
                                wsb_article.article_common_title = WSB_Tytul_calosci;
                                if (WSB_Tytul_calosci != String.Empty | WSB_Tytul_calosci != " " | WSB_Tytul_calosci != null)
                                {
                                    var termFullTitle_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_calosci);
                                    document.Append(termFullTitle_WSB);
                                }
                                WSB_Tytul_calosci = null;

                                if (WSB_Slowa_kluczowe_j_pl_line == null)
                                {
                                    WSB_Slowa_kluczowe_j_pl_line = "Not_defined";
                                }
                                wsb_article.article_pl_keywords = WSB_Slowa_kluczowe_j_pl_line;
                                if (WSB_Slowa_kluczowe_j_pl_line != String.Empty | WSB_Slowa_kluczowe_j_pl_line != " " | WSB_Slowa_kluczowe_j_pl_line != null)
                                {
                                    var term_PL_Keywords_WSB = TextPreparing.TermsPrepataions(WSB_Slowa_kluczowe_j_pl_line);
                                    document.Append(term_PL_Keywords_WSB);
                                }
                                WSB_Slowa_kluczowe_j_pl_line = null;

                                if (WSB_Slowa_kluczowe_j_ang_line == null)
                                {
                                    WSB_Slowa_kluczowe_j_ang_line = "Not_defined";
                                }
                                wsb_article.article_eng_keywords = WSB_Slowa_kluczowe_j_ang_line;
                                if (WSB_Slowa_kluczowe_j_ang_line != String.Empty | WSB_Slowa_kluczowe_j_ang_line != " " | WSB_Slowa_kluczowe_j_ang_line != null)
                                {
                                    var term_Eng_Keywords_WSB = TextPreparing.TermsPrepataions(WSB_Slowa_kluczowe_j_ang_line);
                                    document.Append(term_Eng_Keywords_WSB);
                                }
                                WSB_Slowa_kluczowe_j_ang_line = null;

                                if (WSB_Tytul_pracy_w_innym_j == null)
                                {
                                    WSB_Tytul_pracy_w_innym_j = "Not_defined";
                                }
                                wsb_article.article_title_other_lang = WSB_Tytul_pracy_w_innym_j;
                                if (WSB_Tytul_pracy_w_innym_j != String.Empty | WSB_Tytul_pracy_w_innym_j != " " | WSB_Tytul_pracy_w_innym_j != null)
                                {
                                    var term_Title_Other_Lang_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_pracy_w_innym_j);
                                    document.Append(term_Title_Other_Lang_WSB);
                                }
                                WSB_Tytul_pracy_w_innym_j = null;

                                if (WSB_Szczegoly == null)
                                {
                                    WSB_Szczegoly = "Not_defined";
                                }
                                wsb_article.article_details = WSB_Szczegoly;
                                WSB_Szczegoly = null;

                                if (WSB_URL == null)
                                {
                                    WSB_URL = "Not_defined";
                                }
                                wsb_article.article_URL = WSB_URL;
                                WSB_URL = null;

                                if (WSB_DOI == null)
                                {
                                    WSB_DOI = "Not_defined";
                                }
                                wsb_article.article_DOI = WSB_DOI;
                                WSB_DOI = null;
                                for (int k = 0; k <= WSB_autors.Length - 2;)
                                {
                                    var authors_of_the_article = dbContext.AuthorSet.Create();
                                    authors_of_the_article.author_name     = WSB_autors[k];
                                    authors_of_the_article.author_surename = WSB_autors[k + 1];
                                    wsb_article.Author.Add(authors_of_the_article);
                                    k += 2;
                                }
                                dbContext.WSB_ArticlesSet.Add(wsb_article);

                                var _document = document.ToString().Split(' ', ';', ':', ',');
                                for (int k = 0; k <= _document.Length - 1; k++)
                                {
                                    var      terms              = dbContext.Terms_Vocabulary.Create();
                                    string   dictionary_text    = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv");
                                    string[] allowed_dictionary = dictionary_text.Split(',', '\n');

                                    for (int d = 0; d <= _document.Length - 1; d++)
                                    {
                                        for (int j = 0; j <= allowed_dictionary.Length - 1; j++)
                                        {
                                            if (_document[d].Length > 3 & _document[d].Contains(allowed_dictionary[j]))
                                            {
                                                continue;
                                            }
                                            else if (_document[d].Length <= 3 & !(_document[d].Contains(allowed_dictionary[j])))
                                            {
                                                _document.ToList().RemoveAt(d);
                                            }
                                        }
                                    }

                                    // TODO: record here the id of the document in which the given term occurs
                                    if (_document[k] != String.Empty | _document[k] != " " | _document[k] != null | _document[k] != Char.IsDigit(' ').ToString())
                                    {
                                        //dbContext.Terms_Vocabulary.Where(u)
                                        var termVocabularyTable = dbContext.Terms_Vocabulary;
                                        terms.term_value = _document[k];
                                    }
                                    wsb_article.Terms_Vocabulary.Add(terms);
                                }
                                try
                                {
                                    dbContext.SaveChanges();
                                }
                                catch (Exception ex)
                                {
                                    File.WriteAllText(@"F:\\Magistry files\WSB_crawler_Log.txt", ex.ToString());
                                }
                            }
                        }
                        else
                        {
                            continue;
                        }
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("autor") | WSB_separatedContent[0].Contains("Autor") | WSB_separatedContent[0] == "Autorzy"))
                    {
                        WSB_autors      = WSB_separatedContent[1].Split(autor_separators, StringSplitOptions.RemoveEmptyEntries);
                        WSB_author_line = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul pracy") | WSB_separatedContent[0].Contains("Tytul pracy") | WSB_separatedContent[0] == "Tytul pracy"))
                    {
                        WSB_Tytul_pracy = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].Contains("Liczba odnalezionych") | WSB_separatedContent[0] == "Liczba odnalezionych rekordow"))
                    {
                        WSB_articles_Count  = Convert.ToInt32(WSB_separatedContent[1]);
                        WSB_articles_Matrix = new string[WSB_articles_Count];
                        for (int z = 0; z <= WSB_articles_Count - 1; z++)
                        {
                            WSB_articles_Matrix[z] = (z + 1) + ".";
                        }
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("adres wydawniczy") | WSB_separatedContent[0].Contains("Adres wydawniczy") | WSB_separatedContent[0] == "Adres wydawniczy"))
                    {
                        WSB_Adres_wydawniczy = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("polskie hasla") | WSB_separatedContent[0].Contains("Polskie hasla") | WSB_separatedContent[0] == "Polskie hasla przedmiotowe"))
                    {
                        WSB_Slowa_kluczowe_j_pl      = WSB_separatedContent[1].Split(separators, StringSplitOptions.RemoveEmptyEntries);
                        WSB_Slowa_kluczowe_j_pl_line = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("angielskie hasla") | WSB_separatedContent[0].Contains("Angielskie hasla") | WSB_separatedContent[0] == "Angielskie hasla przedmiotowe"))
                    {
                        WSB_Slowa_kluczowe_j_ang      = WSB_separatedContent[1].Split(separators, StringSplitOptions.RemoveEmptyEntries);
                        WSB_Slowa_kluczowe_j_ang_line = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul calosci") | WSB_separatedContent[0].Contains("Tytul calosci") | WSB_separatedContent[0] == "Tytul calosci"))
                    {
                        WSB_Tytul_calosci = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("doi") | WSB_separatedContent[0].Contains("DOI") | WSB_separatedContent[0] == "DOI"))
                    {
                        WSB_DOI = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul pracy w innym") | WSB_separatedContent[0].Contains("Tytul pracy w innym") | WSB_separatedContent[0] == "Tytul pracy w innym jezyku"))
                    {
                        WSB_Tytul_pracy_w_innym_j = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("szczegoly") | WSB_separatedContent[0].Contains("Szczegoly") | WSB_separatedContent[0] == "Szczegoly"))
                    {
                        WSB_Szczegoly = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("url") | WSB_separatedContent[0].Contains("Url") | WSB_separatedContent[0] == "Adres url"))
                    {
                        WSB_URL = WSB_separatedContent[1];
                    }
                    p++;
                }

                #region Old_iteration_method

                /* -- 21.08.2018 Old version of iteration
                 * for (int i = 0; i <= hapDoc.DocumentNode.InnerText.Length; i++)
                 * {
                 *  WSB_line = sr.ReadLine();
                 *  if (WSB_line != null)
                 *  {
                 *      WSB_newcontent[i] = WSB_line;
                 *      WSB_separatedContent = WSB_line.Split(line_separator, 2);
                 *
                 *
                 *      if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("autor") | WSB_separatedContent[0].Contains("Autor") | WSB_separatedContent[0] == "Autorzy"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_autors = WSB_separatedContent[1].Split(autor_separators, StringSplitOptions.RemoveEmptyEntries);
                 *          WSB_author_line = WSB_separatedContent[1];
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].Contains("Liczba odnalezionych") | WSB_separatedContent[0] == "Liczba odnalezionych rekordow"))
                 *      {
                 *          WSB_articles_Count = Convert.ToInt32(WSB_separatedContent[1]);
                 *          WSB_articles_Matrix = new string[WSB_articles_Count];
                 *          for (int z = 0; z <= WSB_articles_Count - 1; z++)
                 *          {
                 *              WSB_articles_Matrix[z] = (z + 1) + ".";
                 *          }
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 1 & WSB_articles_Matrix.Any(x => WSB_separatedContent[0].Contains(x)))
                 *      {
                 *          if (WSB_author_line != null & WSB_Tytul_pracy != null)
                 *          {
                 *              using(var dbContext = new ArticleDBDataModelContainer())
                 *              {
                 *                  var document = new StringBuilder();
                 *                  var wsb_article = dbContext.WSB_ArticlesSet.Create();
                 *
                 *                  if (WSB_author_line == null)
                 *                  {
                 *                      WSB_author_line = "Not_defined";
                 *                  }
                 *                  wsb_article.article_authors = WSB_author_line;
                 *                  WSB_author_line = null;
                 *
                 *                  if (WSB_Tytul_pracy == null)
                 *                  {
                 *                      WSB_Tytul_pracy = "Not_defined";
                 *                  }
                 *                  wsb_article.article_title = WSB_Tytul_pracy;
                 *                  if (WSB_Tytul_pracy != String.Empty | WSB_Tytul_pracy != " " | WSB_Tytul_pracy != null)
                 *                  {
                 *                      var termTitle_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_pracy);
                 *                      document.Append(termTitle_WSB);
                 *                  }
                 *                  WSB_Tytul_pracy = null;
                 *
                 *                  if (WSB_Adres_wydawniczy == null)
                 *                  {
                 *                      WSB_Adres_wydawniczy = "Not_defined";
                 *                  }
                 *                  wsb_article.article_publisher_adres = WSB_Adres_wydawniczy;
                 *                  WSB_Adres_wydawniczy = null;
                 *
                 *                  if (WSB_Tytul_calosci == null)
                 *                  {
                 *                      WSB_Tytul_calosci = "Not_defined";
                 *                  }
                 *                  wsb_article.article_common_title = WSB_Tytul_calosci;
                 *                  if (WSB_Tytul_calosci != String.Empty | WSB_Tytul_calosci != " " | WSB_Tytul_calosci != null)
                 *                  {
                 *                      var termFullTitle_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_calosci);
                 *                      document.Append(termFullTitle_WSB);
                 *                  }
                 *                  WSB_Tytul_calosci = null;
                 *
                 *                  if (WSB_Slowa_kluczowe_j_pl_line == null)
                 *                  {
                 *                      WSB_Slowa_kluczowe_j_pl_line = "Not_defined";
                 *                  }
                 *                  wsb_article.article_pl_keywords = WSB_Slowa_kluczowe_j_pl_line;
                 *                  if (WSB_Slowa_kluczowe_j_pl_line != String.Empty | WSB_Slowa_kluczowe_j_pl_line != " " | WSB_Slowa_kluczowe_j_pl_line != null)
                 *                  {
                 *                      var term_PL_Keywords_WSB = TextPreparing.TermsPrepataions(WSB_Slowa_kluczowe_j_pl_line);
                 *                      document.Append(term_PL_Keywords_WSB);
                 *                  }
                 *                  WSB_Slowa_kluczowe_j_pl_line = null;
                 *
                 *                  if (WSB_Slowa_kluczowe_j_ang_line == null)
                 *                  {
                 *                      WSB_Slowa_kluczowe_j_ang_line = "Not_defined";
                 *                  }
                 *                  wsb_article.article_eng_keywords = WSB_Slowa_kluczowe_j_ang_line;
                 *                  if (WSB_Slowa_kluczowe_j_ang_line != String.Empty | WSB_Slowa_kluczowe_j_ang_line != " " | WSB_Slowa_kluczowe_j_ang_line != null)
                 *                  {
                 *                      var term_Eng_Keywords_WSB = TextPreparing.TermsPrepataions(WSB_Slowa_kluczowe_j_ang_line);
                 *                      document.Append(term_Eng_Keywords_WSB);
                 *                  }
                 *                  WSB_Slowa_kluczowe_j_ang_line = null;
                 *
                 *                  if (WSB_Tytul_pracy_w_innym_j == null)
                 *                  {
                 *                      WSB_Tytul_pracy_w_innym_j = "Not_defined";
                 *                  }
                 *                  wsb_article.article_title_other_lang = WSB_Tytul_pracy_w_innym_j;
                 *                  if (WSB_Tytul_pracy_w_innym_j != String.Empty | WSB_Tytul_pracy_w_innym_j != " " | WSB_Tytul_pracy_w_innym_j != null)
                 *                  {
                 *                      var term_Title_Other_Lang_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_pracy_w_innym_j);
                 *                      document.Append(term_Title_Other_Lang_WSB);
                 *                  }
                 *                  WSB_Tytul_pracy_w_innym_j = null;
                 *
                 *                  if (WSB_Szczegoly == null)
                 *                  {
                 *                      WSB_Szczegoly = "Not_defined";
                 *                  }
                 *                  wsb_article.article_details = WSB_Szczegoly;
                 *                  WSB_Szczegoly = null;
                 *
                 *                  if (WSB_URL == null)
                 *                  {
                 *                      WSB_URL = "Not_defined";
                 *                  }
                 *                  wsb_article.article_URL = WSB_URL;
                 *                  WSB_URL = null;
                 *
                 *                  if (WSB_DOI == null)
                 *                  {
                 *                      WSB_DOI = "Not_defined";
                 *                  }
                 *                  wsb_article.article_DOI = WSB_DOI;
                 *                  WSB_DOI = null;
                 *
                 *
                 *                  for (int k = 0; k <= WSB_autors.Length - 2;)
                 *                  {
                 *                      var authors_of_the_article = dbContext.AuthorSet.Create();
                 *                      authors_of_the_article.author_name = WSB_autors[k];
                 *                      authors_of_the_article.author_surename = WSB_autors[k + 1];
                 *                      wsb_article.Author.Add(authors_of_the_article);
                 *                      k += 2;
                 *                  }
                 *
                 *                   dbContext.WSB_ArticlesSet.Add(wsb_article);
                 *
                 *                  var _document = document.ToString().Split(' ', ';', ':', ',');
                 *                  for (int k = 0; k <= _document.Length - 1; k++)
                 *                  {
                 *                      var terms = dbContext.Terms_Vocabulary.Create();
                 *                      //
                 *                      string dictionary_text = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv");
                 *                      string[] allowed_dictionary = dictionary_text.Split(',', '\n');
                 *
                 *                      for (int p = 0; p <= _document.Length - 1; p++)
                 *                      {
                 *                          for (int j = 0; j <= allowed_dictionary.Length - 1; j++)
                 *                          {
                 *                              if (_document[p].Length > 3 & _document[p].Contains(allowed_dictionary[j]))
                 *                              {
                 *                                  continue;
                 *                              }
                 *                              else if (_document[p].Length <= 3 & !(_document[p].Contains(allowed_dictionary[j])))
                 *                              {
                 *                                  _document.ToList().RemoveAt(p);
                 *                              }
                 *
                 *                          }
                 *                      }
                 *
                 *                      //tutaj potrzebnie przepisac id dokumenta w ktorym wystepuje dane slowo
                 *                      if (_document[k] != String.Empty | _document[k] != " " | _document[k] != null | _document[k] != Char.IsDigit(' ').ToString())
                 *                      {
                 *                          //dbContext.Terms_Vocabulary.Where(u)
                 *                          var termVocabularyTable = dbContext.Terms_Vocabulary;
                 *                          terms.term_value = _document[k];
                 *
                 *                      }
                 *                      wsb_article.Terms_Vocabulary.Add(terms);
                 *                  }
                 *                  try
                 *                  {
                 *                      dbContext.SaveChanges();
                 *                  }
                 *                  catch (Exception ex)
                 *                  {
                 *                      File.WriteAllText(@"F:\\Magistry files\WSB_crawler_Log.txt", ex.ToString());
                 *                  }
                 *
                 *              }
                 *          }
                 *
                 *          else
                 *          {
                 *              //return;
                 *              //System.Windows.MessageBox.Show("brak danych!");
                 *              //File.WriteAllText(@"F:\\Magistry files\WSB_emptyLines.txt", "empty_line");
                 *              continue;
                 *          }
                 *
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul pracy") | WSB_separatedContent[0].Contains("Tytul pracy") | WSB_separatedContent[0] == "Tytul pracy"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_Tytul_pracy = WSB_separatedContent[1];
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("adres wydawniczy") | WSB_separatedContent[0].Contains("Adres wydawniczy") | WSB_separatedContent[0] == "Adres wydawniczy"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_Adres_wydawniczy = WSB_separatedContent[1];
                 *      }
                 *
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("polskie hasla") | WSB_separatedContent[0].Contains("Polskie hasla") | WSB_separatedContent[0] == "Polskie hasla przedmiotowe"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_Slowa_kluczowe_j_pl = WSB_separatedContent[1].Split(separators, StringSplitOptions.RemoveEmptyEntries);
                 *          WSB_Slowa_kluczowe_j_pl_line = WSB_separatedContent[1];
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("angielskie hasla") | WSB_separatedContent[0].Contains("Angielskie hasla") | WSB_separatedContent[0] == "Angielskie hasla przedmiotowe"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_Slowa_kluczowe_j_ang = WSB_separatedContent[1].Split(separators, StringSplitOptions.RemoveEmptyEntries);
                 *          WSB_Slowa_kluczowe_j_ang_line = WSB_separatedContent[1];
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul calosci") | WSB_separatedContent[0].Contains("Tytul calosci") | WSB_separatedContent[0] == "Tytul calosci"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_Tytul_calosci = WSB_separatedContent[1];
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("doi") | WSB_separatedContent[0].Contains("DOI") | WSB_separatedContent[0] == "DOI"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_DOI = WSB_separatedContent[1];
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul pracy w innym") | WSB_separatedContent[0].Contains("Tytul pracy w innym") | WSB_separatedContent[0] == "Tytul pracy w innym jezyku"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_Tytul_pracy_w_innym_j = WSB_separatedContent[1];
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("szczegoly") | WSB_separatedContent[0].Contains("Szczegoly") | WSB_separatedContent[0] == "Szczegoly"))
                 *      {
                 *         //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_Szczegoly = WSB_separatedContent[1];
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("url") | WSB_separatedContent[0].Contains("Url") | WSB_separatedContent[0] == "Adres url"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_URL = WSB_separatedContent[1];
                 *      }
                 *
                 *      //else if (PP_separatedContent.Length == 1 & PP_separatedContent[0] == String.Empty) System.Windows.MessageBox.Show("The empty line detected", "Empty line", System.Windows.MessageBoxButton.OK);
                 *      //else System.Windows.MessageBox.Show("Error! Content not found!", "Error!", System.Windows.MessageBoxButton.OK);
                 *  }
                 * }
                 */
                #endregion
            }
        }
Пример #4
0
        /// <summary>
        /// Reads the text held in <c>endText</c> line by line, collects the fields of each
        /// UMK article into the static <c>UMK_*</c> fields and stores every completed
        /// article through <c>ArticleDBDataModelContainer</c>.
        /// </summary>
        /// <remarks>
        /// Every line is split on <c>line_separator</c> into at most two parts: a label and
        /// a value. Single-part lines are either record markers ("1.", "2.", ...), which
        /// close the article collected so far, or URLs. Two-part lines are dispatched on
        /// their label. Label matching ignores case.
        /// </remarks>
        public static void get_UMK_Document_content()
        {
            UMK_articles_Count = 0;

            // Record markers are generated once the record-count line has been read. Until
            // then the list holds only the empty string, which every string contains, so any
            // non-empty single-part line is treated as a marker.
            string[] PP_articles_Matrix = { String.Empty };

            using (StringReader sr = new StringReader(endText))
            {
                string UMK_line;

                // Read until the reader is exhausted; the number of lines in endText is not
                // tied to the length of any other document.
                while ((UMK_line = sr.ReadLine()) != null)
                {
                    string[] UMK_separatedContent = UMK_line.Split(line_separator, 2);
                    string   label                = UMK_separatedContent[0];

                    if (UMK_separatedContent.Length == 1)
                    {
                        if (label == "")
                        {
                            continue;
                        }

                        if (PP_articles_Matrix.Any(x => label.Contains(x)))
                        {
                            // A record marker closes the article collected so far.
                            UMK_SavePendingArticle();
                        }
                        else if (UMK_LabelContains(label, "http://", "https://"))
                        {
                            UMK_Adres_URL = label;
                        }

                        continue;
                    }

                    string value = UMK_separatedContent[1];

                    if (UMK_LabelContains(label, "aut."))
                    {
                        UMK_autors      = value.Split(separators, StringSplitOptions.RemoveEmptyEntries);
                        UMK_author_line = value;
                    }
                    else if (UMK_LabelContains(label, "zapyt"))
                    {
                        // NOTE(review): this SQL text is built by concatenating parsed input.
                        // It is only stored here; if it is ever executed it must be replaced
                        // by a parameterized query.
                        UMK_Zapytanie_Wyszukiwania = "SELECT * FROM UMK_Splendor_Expertus_article_database WHERE article LIKE " + value;
                    }
                    else if (label.Contains("Liczba odnalezionych"))
                    {
                        UMK_articles_Count = Convert.ToInt32(value);
                        PP_articles_Matrix = new string[UMK_articles_Count];
                        for (int z = 0; z < UMK_articles_Count; z++)
                        {
                            PP_articles_Matrix[z] = (z + 1) + ".";
                        }
                    }
                    // The three specific title labels must be tested before the generic
                    // "tytu" fragment: each of them contains it, so the generic branch would
                    // otherwise swallow them and overwrite the article title.
                    else if (UMK_LabelContains(label, "tytuł wydawn. zbior.", "tytul wydawn. zbior."))
                    {
                        UMK_Tytul_Wydawn_Zbior = value;
                    }
                    else if (UMK_LabelContains(label, "pełny tytuł czasop.", "pelny tytul czasop."))
                    {
                        UMK_Pelny_tytul_czasop = value;
                    }
                    else if (UMK_LabelContains(label, "tytuł równoległy", "tytul rownolegly"))
                    {
                        UMK_Tytul_rownolegly = value;
                    }
                    else if (UMK_LabelContains(label, "tytu"))
                    {
                        UMK_Tytul = value;
                    }
                    else if (UMK_LabelContains(label, "opis wydawn"))
                    {
                        UMK_Opis_wydawn = value;
                    }
                    else if (UMK_LabelContains(label, "język", "jezyk"))
                    {
                        UMK_Jezyk_Publikacji = value;
                    }
                    else if (UMK_LabelContains(label, "polskie słowa kluczowe", "polskie slo"))
                    {
                        UMK_Slowa_kluczowe_j_pl = value.Split(separators);
                        UMK_pl_keywords_line    = value;
                    }
                    else if (UMK_LabelContains(label, "angielskie słowa kluczowe", "angielskie slowa kluczowe"))
                    {
                        UMK_Slowa_kluczowe_j_ang = value.Split(separators);
                        UMK_en_keywords_line     = value;
                    }
                }
            }

            // The last article has no following marker line, so store it here. This is a
            // no-op when the pending article was already saved (author/title are null then).
            UMK_SavePendingArticle();
        }

        /// <summary>
        /// Returns true when <paramref name="label"/> contains at least one of
        /// <paramref name="fragments"/>, compared ordinally and ignoring case.
        /// </summary>
        private static bool UMK_LabelContains(string label, params string[] fragments)
        {
            foreach (string fragment in fragments)
            {
                if (label.IndexOf(fragment, StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Appends the prepared terms of <paramref name="text"/> to
        /// <paramref name="document"/>; blank text contributes nothing.
        /// </summary>
        private static void UMK_AppendTerms(StringBuilder document, string text)
        {
            if (!String.IsNullOrWhiteSpace(text))
            {
                // NOTE(review): results are appended back to back without a separator;
                // confirm that TermsPrepataions output ends with one, otherwise the last
                // term of one field merges with the first term of the next.
                document.Append(TextPreparing.TermsPrepataions(text));
            }
        }

        /// <summary>
        /// Stores the article currently collected in the static <c>UMK_*</c> fields together
        /// with its authors and vocabulary terms, then resets those fields to null.
        /// Does nothing unless both the author line and the title are present.
        /// Missing optional fields are stored as "Not_defined".
        /// </summary>
        private static void UMK_SavePendingArticle()
        {
            if (UMK_author_line == null || UMK_Tytul == null)
            {
                return;
            }

            using (var dbContext = new ArticleDBDataModelContainer())
            {
                var document    = new StringBuilder();
                var umk_article = dbContext.UMK_ArticlesSet.Create();

                umk_article.article_author_line = UMK_author_line;
                UMK_author_line = null;

                umk_article.article_title = UMK_Tytul;
                UMK_AppendTerms(document, UMK_Tytul);
                UMK_Tytul = null;

                // NOTE(review): as before, the "Not_defined" placeholder of a missing field
                // is also passed to the term preparation; confirm whether it should be
                // kept out of the vocabulary.
                string fullTitle = UMK_Pelny_tytul_czasop ?? "Not_defined";
                umk_article.article_Full_title = fullTitle;
                UMK_AppendTerms(document, fullTitle);
                UMK_Pelny_tytul_czasop = null;

                umk_article.article_language = UMK_Jezyk_Publikacji ?? "Not_defined";
                UMK_Jezyk_Publikacji         = null;

                string parallelTitle = UMK_Tytul_rownolegly ?? "Not_defined";
                umk_article.article_translated_title = parallelTitle;
                UMK_AppendTerms(document, parallelTitle);
                UMK_Tytul_rownolegly = null;

                string englishKeywords = UMK_en_keywords_line ?? "Not_defined";
                umk_article.article_eng_keywords = englishKeywords;
                UMK_AppendTerms(document, englishKeywords);
                UMK_en_keywords_line = null;

                string polishKeywords = UMK_pl_keywords_line ?? "Not_defined";
                umk_article.article_pl_keywords = polishKeywords;
                UMK_AppendTerms(document, polishKeywords);
                UMK_pl_keywords_line = null;

                umk_article.article_url = UMK_Adres_URL ?? "Not_defined";
                UMK_Adres_URL           = null;

                umk_article.article_publisher_title = UMK_Tytul_Wydawn_Zbior ?? "Not_defined";
                UMK_Tytul_Wydawn_Zbior = null;

                umk_article.article_publisher_desc = UMK_Opis_wydawn ?? "Not_defined";
                UMK_Opis_wydawn = null;

                // The author array is consumed in (name, surname) pairs; a trailing
                // unpaired entry is ignored.
                if (UMK_autors != null)
                {
                    for (int k = 0; k + 1 < UMK_autors.Length; k += 2)
                    {
                        var authors_of_the_article = dbContext.AuthorSet.Create();
                        authors_of_the_article.author_name     = UMK_autors[k];
                        authors_of_the_article.author_surename = UMK_autors[k + 1];
                        umk_article.Author.Add(authors_of_the_article);
                    }
                }

                dbContext.UMK_ArticlesSet.Add(umk_article);

                // NOTE(review): no allowed-term dictionary filter is applied here. The
                // previous attempt removed items from a temporary copy of the token array
                // and so never filtered anything; the intended rule has to be confirmed
                // before a real filter is added.
                // TODO: record the id of the document in which each term occurs.
                string[] tokens = document.ToString().Split(' ', ';', ':', ',');
                foreach (string token in tokens)
                {
                    // Splitting leaves empty entries between adjacent separators; they
                    // are not vocabulary terms.
                    if (String.IsNullOrWhiteSpace(token))
                    {
                        continue;
                    }

                    var terms = dbContext.Terms_Vocabulary.Create();
                    terms.term_value = token;
                    umk_article.Terms_Vocabulary.Add(terms);
                }

                try
                {
                    dbContext.SaveChanges();
                }
                catch (Exception ex)
                {
                    // Best effort: a failed save is written to the log file and parsing
                    // continues with the next article.
                    File.WriteAllText(@"F:\\Magistry files\UMK_crawler_Log.txt", ex.ToString());
                }
            }
        }
Пример #5
0
        /// <summary>
        /// Imports every file found in <c>filePathBibtex</c>: reads the BibTeX-style
        /// "key separator value" lines of each file into the class-level fields,
        /// then stores one PG article (with its authors and vocabulary terms) through
        /// <c>ArticleDBDataModelContainer</c>.
        /// </summary>
        /// <remarks>
        /// A failure while saving one file is appended to the crawler log and the
        /// import continues with the next file. The parsed fields are cleared after
        /// every file, so values never leak from one article into the next.
        /// </remarks>
        public static void LoadBibtexFile()
        {
            string[] fileEntries = Directory.GetFiles(filePathBibtex);
            // NOTE(review): the list contains the letter 'x', so it is stripped from
            // abstracts and terms as well - confirm this is intended.
            char[]   not_allowedCharsforArticle = { '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '<', '>', 'x', '!', '#', '$', '%', '^', '&', '*', '(', ')', '/', '\'' };

            foreach (string file in fileEntries)
            {
                ReadBibtexFields(file, not_allowedCharsforArticle);

                try
                {
                    SaveBibtexArticle(not_allowedCharsforArticle);
                    Console.WriteLine("End of file! Go to the next ->");
                }
                catch (Exception ex)
                {
                    // Append instead of overwrite so that every failed file of the
                    // batch stays in the log, not only the last one.
                    File.AppendAllText(@"F:\\Magistry files\PG_crawler_Log.txt", ex.ToString() + Environment.NewLine);
                }
                finally
                {
                    // Clear the parsed values even when saving failed, otherwise the
                    // next file would inherit this file's leftovers.
                    ResetBibtexFields();
                }
            }
        }

        /// <summary>
        /// Reads at most the first 14 lines of <paramref name="file"/> and copies the
        /// recognised fields into the class-level fields.
        /// </summary>
        private static void ReadBibtexFields(string file, char[] disallowedChars)
        {
            const int maxLines = 14;

            context          = new string[maxLines];
            separatedContext = new string[2];

            using (StreamReader reader = new StreamReader(File.OpenRead(file)))
            {
                for (int i = 0; i < maxLines; i++)
                {
                    string line = reader.ReadLine();
                    if (line == null)
                    {
                        // End of file: the remaining context slots stay null.
                        break;
                    }

                    Console.WriteLine("Processing " + i.ToString() + " line.");
                    line       = line.TrimStart(' ').Replace('\"', ' ').Replace('\\', ' ').TrimEnd(',');
                    context[i] = line;

                    string[] parts = line.Split(separators, 2, StringSplitOptions.RemoveEmptyEntries);
                    separatedContext = parts;

                    // Lines without both a key and a value (entry header, closing
                    // brace, blank line) carry no field and are skipped.
                    if (parts.Length < 2)
                    {
                        continue;
                    }

                    StoreBibtexField(parts[0], parts[1], disallowedChars);
                }
            }
        }

        /// <summary>
        /// Copies one key/value pair into the matching class-level field.
        /// Keys are matched by substring, in the order listed below; unknown keys
        /// are ignored.
        /// </summary>
        private static void StoreBibtexField(string key, string value, char[] disallowedChars)
        {
            // NOTE(review): substring matching means a key such as "booktitle"
            // also lands in the title branch - confirm against the input files.
            if (key.Contains("title"))
            {
                if (value.Length >= 2)
                {
                    _title = value;
                }
            }
            else if (key.Contains("abstract"))
            {
                string cleanedAbstract = StripDisallowedChars(value, disallowedChars);
                if (cleanedAbstract.Length >= 5)
                {
                    _abstract = cleanedAbstract;
                }
            }
            else if (key.Contains("keywords"))
            {
                if (!string.IsNullOrWhiteSpace(value))
                {
                    _keywords = value;
                }
            }
            else if (key.Contains("year"))
            {
                // A non-numeric year is skipped instead of raising an exception.
                int parsedYear;
                if (int.TryParse(value, out parsedYear))
                {
                    _year = parsedYear;
                }
            }
            else if (key.Contains("country"))
            {
                _country = value;
            }
            else if (key.Contains("author"))
            {
                _authorsLine = value;
                _authors     = value.Split(authorSeparator, StringSplitOptions.RemoveEmptyEntries);
            }
            else if (key.Contains("organization"))
            {
                _organization = value;
            }
            else if (key.Contains("url"))
            {
                _url = value;
            }
        }

        /// <summary>
        /// Creates one PG article from the class-level fields, attaches its authors
        /// and vocabulary terms, and saves everything in a single SaveChanges call.
        /// </summary>
        private static void SaveBibtexArticle(char[] disallowedChars)
        {
            using (var dbContext = new ArticleDBDataModelContainer())
            {
                var document      = new StringBuilder();
                var bibtexArticle = dbContext.PG_ArticlesSet.Create();

                bibtexArticle.title = _title;
                AppendTerms(document, _title);

                bibtexArticle.abstractText = _abstract;
                AppendTerms(document, _abstract);

                bibtexArticle.keywords = _keywords;
                AppendTerms(document, _keywords);

                bibtexArticle.year          = _year;
                bibtexArticle.country       = _country;
                bibtexArticle.authors       = _authorsLine;
                bibtexArticle.organizations = _organization;
                bibtexArticle.url           = _url;

                // _authors is consumed in pairs (name, surname); a trailing
                // unpaired element is ignored. It is null when the file had no
                // author line.
                if (_authors != null)
                {
                    for (int i = 0; i + 1 < _authors.Length; i += 2)
                    {
                        var authors_of_the_article = dbContext.AuthorSet.Create();
                        authors_of_the_article.author_name     = _authors[i];
                        authors_of_the_article.author_surename = _authors[i + 1];
                        bibtexArticle.Author.Add(authors_of_the_article);
                    }
                }

                dbContext.PG_ArticlesSet.Add(bibtexArticle);

                // Strip the disallowed characters from every token first, then keep
                // only tokens longer than three characters.
                var new_document = new List<string>();
                foreach (string token in document.ToString().Split(' ', ';', ':', ','))
                {
                    string cleanedToken = StripDisallowedChars(token, disallowedChars);
                    if (cleanedToken.Length > 3)
                    {
                        new_document.Add(cleanedToken);
                    }
                }

                // Terms are added without checking whether the vocabulary already
                // contains them (an earlier de-duplication attempt was disabled as
                // not working). TODO: record the id of the document in which each
                // term occurs.
                foreach (string term in new_document)
                {
                    var terms = dbContext.Terms_Vocabulary.Create();
                    terms.term_value = term;
                    bibtexArticle.Terms_Vocabulary.Add(terms);
                }

                dbContext.SaveChanges();
            }
        }

        /// <summary>
        /// Appends the prepared terms of <paramref name="fieldText"/> to
        /// <paramref name="document"/>; null or blank text contributes nothing.
        /// </summary>
        private static void AppendTerms(StringBuilder document, string fieldText)
        {
            if (string.IsNullOrWhiteSpace(fieldText))
            {
                return;
            }

            var fieldTerms = TextPreparing.TermsPrepataions(fieldText);
            document.Append(fieldTerms);
            // The buffer is later split on ' ', so the separator keeps the last
            // term of one field from fusing with the first term of the next.
            document.Append(' ');
        }

        /// <summary>
        /// Returns <paramref name="text"/> without any character listed in
        /// <paramref name="disallowed"/>. Null or empty input is returned unchanged.
        /// </summary>
        private static string StripDisallowedChars(string text, char[] disallowed)
        {
            if (string.IsNullOrEmpty(text))
            {
                return text;
            }

            var cleaned = new StringBuilder(text.Length);
            foreach (char c in text)
            {
                if (Array.IndexOf(disallowed, c) < 0)
                {
                    cleaned.Append(c);
                }
            }

            return cleaned.ToString();
        }

        /// <summary>
        /// Clears every class-level field filled from a BibTeX file.
        /// </summary>
        private static void ResetBibtexFields()
        {
            _title        = null;
            _abstract     = null;
            _keywords     = null;
            _year         = 0;
            _country      = null;
            _authorsLine  = null;
            _authors      = null;
            _organization = null;
            _url          = null;
        }