Exemplo n.º 1
0
        /// <summary>
        /// Collects the geodesy-class documents from <c>dbo.PG_ArticlesSet</c>.
        /// </summary>
        /// <remarks>
        /// A row qualifies when its abstract text or its keywords match the
        /// <c>LIKE '%GEODE%'</c> pattern. Hand-written sample entries that were once
        /// added here directly have been superseded by the database query.
        /// </remarks>
        /// <returns>
        /// One string per matching row: title, abstract text and keywords
        /// concatenated without any separator.
        /// </returns>
        public static List <string> GeodesyClasseOfDocuments_ListCreations()
        {
            // NOTE(review): both OR-groups of the predicate are identical, so the second
            // one is redundant - possibly a second keyword was intended. Left unchanged.
            const string geodesyQuery = @"SELECT * FROM dbo.PG_ArticlesSet WHERE (PG_ArticlesSet.abstractText LIKE '%GEODE%' OR PG_ArticlesSet.keywords LIKE '%GEODE%') OR (PG_ArticlesSet.abstractText LIKE '%GEODE%' OR PG_ArticlesSet.keywords LIKE '%GEODE%')";

            var documents = new List <string>();

            using (var db = new ArticleDBDataModelContainer())
            {
                // The query is enumerated while the context is still alive.
                foreach (var row in db.PG_ArticlesSet.SqlQuery(geodesyQuery))
                {
                    var text = row.title + row.abstractText + row.keywords;
                    documents.Add(text);
                }
            }

            return documents;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Exports every row of <c>dbo.AuthorSet</c> twice: as quoted CSV lines and as a
        /// JavaScript array assignment (<c>var authors = [...]</c>).
        /// </summary>
        /// <remarks>
        /// Both files are opened with <see cref="File.AppendText(string)"/>, so running the
        /// export again appends a second copy instead of overwriting the first.
        /// CSV values are wrapped in double quotes but embedded quotes are not escaped.
        /// </remarks>
        /// <param name="authorsCSV">Path of the CSV file that receives the header and the author rows.</param>
        /// <param name="authorsJson">Path of the file that receives the <c>var authors = [...]</c> text.</param>
        public static void GenerateAuthorsToCSVandJsonFromDB(string authorsCSV, string authorsJson)
        {
            // Builders avoid the quadratic cost of repeated string concatenation in the loop.
            var csvBuilder  = new System.Text.StringBuilder();
            var jsonBuilder = new System.Text.StringBuilder("var authors = [");

            csvBuilder.Append(authorHeaderCSV);
            using (var AuthorDBContext = new ArticleDBDataModelContainer())
            {
                // ToList() never returns null, so the result can be iterated directly.
                var authors_Result = AuthorDBContext.AuthorSet.SqlQuery("SELECT * FROM dbo.AuthorSet").ToList();
                bool isFirstAuthor = true;

                foreach (var item in authors_Result)
                {
                    AuthorsJsonObj authorsJsonObj = new AuthorsJsonObj(item.author_Id, item.author_name, item.author_surename);

                    csvBuilder.Append("\"" + item.author_Id + "\",")
                              .Append("\"" + item.author_name + "\",")
                              .Append("\"" + item.author_surename + "\"")
                              .Append('\n');

                    // Array elements must be comma-separated; without the separator the
                    // emitted "var authors = [...]" literal cannot be parsed.
                    if (!isFirstAuthor)
                    {
                        jsonBuilder.Append(",\n");
                    }
                    jsonBuilder.Append(JsonConvert.SerializeObject(authorsJsonObj));
                    isFirstAuthor = false;
                }

                if (!isFirstAuthor)
                {
                    // Keep the closing bracket on its own line, as before.
                    jsonBuilder.Append('\n');
                }
            }
            jsonBuilder.Append("]");

            using (StreamWriter sw = File.AppendText(authorsCSV))
            {
                sw.Write(csvBuilder.ToString());
            }
            using (StreamWriter json_SW = File.AppendText(authorsJson))
            {
                json_SW.Write(jsonBuilder.ToString());
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Looks up the author line of an article, trying each article table in turn.
        /// </summary>
        /// <remarks>
        /// Tables are probed in the order PG, PP, UG, UMK, WSB and the first table that
        /// returns at least one row wins. The previous implementation chained the
        /// fallbacks with <c>else if</c> on the same condition, so only the PG and PP
        /// tables were ever queried.
        /// </remarks>
        /// <param name="ArticleID">Value matched against the <c>article_Id</c> column.</param>
        /// <returns>
        /// The rows of the first non-empty result joined with <c>", "</c>, or an empty
        /// string when no table contains the article.
        /// </returns>
        public static string SelectAutorsFromDB(int ArticleID)
        {
            // One query per source table, in lookup priority order. The id is passed as
            // the positional parameter @p0 instead of being concatenated into the SQL.
            string[] authorQueries =
            {
                "SELECT authors FROM dbo.PG_ArticlesSet WHERE article_Id=@p0",
                "SELECT article_author_line FROM dbo.PP_ArticlesSet WHERE article_Id=@p0",
                "SELECT article_author_line FROM dbo.UG_ArticlesSet WHERE article_Id=@p0",
                "SELECT article_authors_line FROM dbo.UMK_ArticlesSet WHERE article_Id=@p0",
                "SELECT article_authors FROM dbo.WSB_ArticlesSet WHERE article_Id=@p0"
            };

            List <string> Author_list = new List <string>();

            using (var article = new ArticleDBDataModelContainer())
            {
                foreach (string query in authorQueries)
                {
                    Author_list = article.Database.SqlQuery <string>(query, ArticleID).ToList();

                    if (Author_list.Count > 0)
                    {
                        // Stop at the first table that knows this article.
                        break;
                    }
                }
            }

            return string.Join(", ", Author_list.ToArray());
        }
Exemplo n.º 4
0
        /// <summary>
        /// Collects the survey-and-measurements-class documents from <c>dbo.PG_ArticlesSet</c>.
        /// </summary>
        /// <remarks>
        /// A row qualifies when its abstract text or its keywords match either the
        /// <c>LIKE '%BADAN%'</c> or the <c>LIKE '%POMIAR%'</c> pattern.
        /// </remarks>
        /// <returns>
        /// One string per matching row: title, abstract text and keywords
        /// concatenated without any separator.
        /// </returns>
        public static List <string> SurveyAndMeasurementsClassOfDocuments_ListCreations()
        {
            const string surveyQuery = @"SELECT * FROM dbo.PG_ArticlesSet WHERE (PG_ArticlesSet.abstractText LIKE '%BADAN%' OR PG_ArticlesSet.keywords LIKE '%BADAN%') OR (PG_ArticlesSet.abstractText LIKE '%POMIAR%' OR PG_ArticlesSet.keywords LIKE '%POMIAR%')";

            var documents = new List <string>();

            using (var db = new ArticleDBDataModelContainer())
            {
                // The query is enumerated while the context is still alive.
                foreach (var row in db.PG_ArticlesSet.SqlQuery(surveyQuery))
                {
                    var text = row.title + row.abstractText + row.keywords;
                    documents.Add(text);
                }
            }

            return documents;
        }
Exemplo n.º 5
0
        /// <summary>
        /// Rebuilds the shared <c>termCollection</c> set from the <c>Terms_Vocabulary</c> table.
        /// </summary>
        /// <remarks>
        /// The shared set is replaced with a fresh instance before loading starts, then
        /// filled with the lower-cased <c>term_value</c> of every vocabulary row.
        /// </remarks>
        /// <returns>The same set instance that was stored in <c>termCollection</c>.</returns>
        public static HashSet <string> getTermCollection()
        {
            termCollection = new HashSet <string>();

            using (var db = new ArticleDBDataModelContainer())
            {
                var vocabulary = db.Terms_Vocabulary;

                // Pull the whole table into the context so that Local holds every row.
                vocabulary.Load();

                foreach (var entry in vocabulary.Local)
                {
                    var lowered = entry.term_value.ToLower();
                    termCollection.Add(lowered);
                }
            }

            return termCollection;
        }
        /// <summary>
        /// Loads all five article tables (PG, PP, UG, UMK, WSB) and builds a dictionary
        /// that maps an article id to the concatenated text fields of that article.
        /// </summary>
        /// <remarks>
        /// Tables are processed in the order listed above and the first record seen for
        /// an id wins; later records with the same id are skipped. The previous guard
        /// (<c>!ContainsKey || !ContainsValue</c>) still called <c>Add</c> when the id was
        /// already present with different text, which throws <see cref="ArgumentException"/>.
        /// After loading, one timing line is appended to the processing log file.
        /// </remarks>
        /// <returns>Dictionary keyed by <c>article_Id</c> with the concatenated document text as value.</returns>
        public static Dictionary <int, string> GenerateDocumentCollection_withoutLazyLoadingToDictionary()
        {
            Dictionary <int, string> DocumentCollection = new Dictionary <int, string>();

            // counter1 counts the records actually added; counter2 and counter3 are
            // incremented exactly once per call and only appear in the log line.
            int counter1 = 0;
            int counter2 = 0;
            int counter3 = 0;

            var database_processing = Stopwatch.StartNew();

            using (var dbContext = new ArticleDBDataModelContainer())
            {
                dbContext.PG_ArticlesSet.Load();

                foreach (var PG_articles in dbContext.PG_ArticlesSet.Local)
                {
                    int articleId = Convert.ToInt32(PG_articles.article_Id);
                    if (DocumentCollection.ContainsKey(articleId))
                    {
                        continue;
                    }

                    DocumentCollection.Add(articleId, PG_articles.title + PG_articles.abstractText + PG_articles.keywords);
                    counter1++;
                }

                dbContext.PP_ArticlesSet.Load();

                foreach (var PP_articles in dbContext.PP_ArticlesSet.Local)
                {
                    int articleId = Convert.ToInt32(PP_articles.article_Id);
                    if (DocumentCollection.ContainsKey(articleId))
                    {
                        continue;
                    }

                    DocumentCollection.Add(articleId, PP_articles.article_title + PP_articles.article_source);
                    counter1++;
                }

                dbContext.UG_ArticlesSet.Load();

                foreach (var UG_articles in dbContext.UG_ArticlesSet.Local)
                {
                    int articleId = Convert.ToInt32(UG_articles.article_Id);
                    if (DocumentCollection.ContainsKey(articleId))
                    {
                        continue;
                    }

                    DocumentCollection.Add(articleId, UG_articles.article_title + UG_articles.article_source + UG_articles.article_keywords);
                    counter1++;
                }

                dbContext.UMK_ArticlesSet.Load();

                foreach (var UMK_articles in dbContext.UMK_ArticlesSet.Local)
                {
                    int articleId = Convert.ToInt32(UMK_articles.article_Id);
                    if (DocumentCollection.ContainsKey(articleId))
                    {
                        continue;
                    }

                    DocumentCollection.Add(articleId, UMK_articles.article_title + UMK_articles.article_Full_title + UMK_articles.article_eng_keywords + UMK_articles.article_pl_keywords + UMK_articles.article_translated_title);
                    counter1++;
                }

                dbContext.WSB_ArticlesSet.Load();

                foreach (var WSB_articles in dbContext.WSB_ArticlesSet.Local)
                {
                    int articleId = Convert.ToInt32(WSB_articles.article_Id);
                    if (DocumentCollection.ContainsKey(articleId))
                    {
                        continue;
                    }

                    DocumentCollection.Add(articleId, WSB_articles.article_title + WSB_articles.article_common_title + WSB_articles.article_title_other_lang + WSB_articles.article_eng_keywords + WSB_articles.article_pl_keywords + WSB_articles.article_details);
                    counter1++;
                }

                counter2++;
            }

            counter3++;
            database_processing.Stop();

            // NOTE(review): the log path is machine-specific; consider making it configurable.
            string processing_log = @"F:\Magistry files\Processing_log.txt";

            using (StreamWriter sw = File.AppendText(processing_log))
            {
                sw.WriteLine(DateTime.Now.ToString() + " The database processing time is: " + database_processing.Elapsed.Minutes.ToString() + ":" + database_processing.Elapsed.TotalMilliseconds.ToString() + ", database context counter: " + counter2.ToString() + ", selection counter in one dbContext: " + counter1.ToString() + ", method executing counter: " + counter3.ToString());
            }

            return(DocumentCollection);
        }
Exemplo n.º 7
0
        /// <summary>
        /// Collects the architecture-class documents from <c>dbo.PG_ArticlesSet</c>.
        /// </summary>
        /// <remarks>
        /// A row qualifies when its abstract text or its keywords match the
        /// <c>LIKE '%ARCHITEKT%'</c> pattern. Hand-written sample entries that were once
        /// added here directly have been superseded by the database query.
        /// Open design questions recorded by the original author:
        /// how should the classes for document assignment be chosen - by the
        /// organizations assigned to documents (not every document has one) or by
        /// title/abstract/keywords - and how can the assignment be automated?
        /// </remarks>
        /// <returns>
        /// One string per matching row: title, abstract text and keywords
        /// concatenated without any separator.
        /// </returns>
        public static List <string> ArchitectureClasseOfDocuments_ListCreations()
        {
            // NOTE(review): the third predicate repeats the first one (abstractText),
            // so it is redundant. Left unchanged.
            const string architectureQuery = @"SELECT * FROM dbo.PG_ArticlesSet WHERE PG_ArticlesSet.abstractText LIKE '%ARCHITEKT%' OR PG_ArticlesSet.keywords LIKE '%ARCHITEKT%' OR PG_ArticlesSet.abstractText LIKE '%ARCHITEKT%';";

            var documents = new List <string>();

            using (var db = new ArticleDBDataModelContainer())
            {
                // The query is enumerated while the context is still alive.
                foreach (var row in db.PG_ArticlesSet.SqlQuery(architectureQuery))
                {
                    var text = row.title + row.abstractText + row.keywords;
                    documents.Add(text);
                }
            }

            return documents;
        }
Exemplo n.º 8
0
        /// <summary>
        /// Builds the TF-IDF vector space model for a collection of documents.
        /// </summary>
        /// <remarks>
        /// The vocabulary is reloaded from <c>Terms_Vocabulary</c> (lower-cased) into the
        /// shared <c>termHashset</c>. For every document one weight per vocabulary term is
        /// computed with <c>TFIDF2ndrealization.FindTFIDF</c>. Documents are processed in
        /// parallel; each iteration works on its own array and vector object and writes
        /// its result into a dedicated slot, so no state is shared between threads.
        /// The previous version shared the working array and the vector variable across
        /// iterations and appended to a <see cref="List{T}"/> concurrently, which could
        /// corrupt or lose results. The returned list now follows the input order.
        /// Finally one timing line is appended to the processing log file.
        /// </remarks>
        /// <param name="collection">Document texts; must not be null.</param>
        /// <returns>One <c>DocumentVector</c> per input document, in input order.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="collection"/> is null.</exception>
        public static List <DocumentVector> DocumentCollectionProcessing(List <String> collection)
        {
            if (collection == null)
            {
                throw new ArgumentNullException("collection");
            }

            parallelOption.MaxDegreeOfParallelism = 20;
            var vector_space_model_calculation = Stopwatch.StartNew();

            termHashset = new HashSet <string>();

            using (var dbContext = new ArticleDBDataModelContainer())
            {
                dbContext.Terms_Vocabulary.Load();

                foreach (var terms in dbContext.Terms_Vocabulary.Local)
                {
                    termHashset.Add(terms.term_value.ToLower());
                }
            }

            // Snapshot the vocabulary so that every worker iterates the same fixed array
            // even if the shared set is replaced while the loop is running.
            string[] vocabulary = new string[termHashset.Count];
            termHashset.CopyTo(vocabulary);

            // One slot per document: writers never touch the same element.
            DocumentVector[] vectors = new DocumentVector[collection.Count];

            // NOTE(review): assumes FindTFIDF only reads `collection` - confirm it is safe
            // to call concurrently.
            Parallel.For(0, collection.Count, parallelOption, docIndex =>
            {
                string  document = collection[docIndex];
                float[] space    = new float[vocabulary.Length];

                for (int termIndex = 0; termIndex < vocabulary.Length; termIndex++)
                {
                    space[termIndex] = Logic.ClusteringAlgorithms.Used_functions.TFIDF2ndrealization.FindTFIDF(collection, document, vocabulary[termIndex]);
                }

                DocumentVector documentVector = new DocumentVector();
                documentVector.Content     = document;
                documentVector.VectorSpace = space;
                // Kept as IndexOf for compatibility: duplicate texts share the index of
                // their first occurrence.
                documentVector.index_Of_Doc_for_labeling = collection.IndexOf(document);

                vectors[docIndex] = documentVector;
            });

            vector_space_model_calculation.Stop();

            // NOTE(review): the log path is machine-specific; consider making it configurable.
            string processing_log = @"F:\Magistry files\Processing_log.txt";

            using (StreamWriter sw = File.AppendText(processing_log))
            {
                sw.WriteLine(DateTime.Now.ToString() + " The vector space model calculation time is: " + vector_space_model_calculation.Elapsed.Minutes.ToString() + ":" + vector_space_model_calculation.Elapsed.TotalMilliseconds.ToString());
            }

            return new List <DocumentVector>(vectors);
        }
Exemplo n.º 9
0
        public static void LoadBibtexFile()
        {
            string[] fileEntries = Directory.GetFiles(filePathBibtex);
            char[]   not_allowedCharsforArticle = { '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '<', '>', 'x', '!', '#', '$', '%', '^', '&', '*', '(', ')', '/', '\'' };
            string[] new_document = new string[0];

            foreach (string file in fileEntries)
            {
                using (StreamReader reader = new StreamReader(File.OpenRead(file)))
                {
                    if (reader.ToString() != null || !reader.ToString().Contains("title ="))
                    {
                        context          = new string[14];
                        separatedContext = new string[2];


                        for (int i = 0; i <= context.Count() - 1; i++)
                        {
                            context[i] = reader.ReadLine();
                            if (context[i] != null || context[i] == "}")
                            {
                                try
                                {
                                    Console.WriteLine("Processing " + i.ToString() + " line.");
                                    context[i]       = context[i].TrimStart(' ').Replace('\"', ' ').Replace('\\', ' ').TrimEnd(',');
                                    separatedContext = context[i].Split(separators, 2, StringSplitOptions.RemoveEmptyEntries);


                                    #region getVariables
                                    if (separatedContext[0].Contains("title"))
                                    {
                                        #region little_modification_for_title_clearing

                                        /*
                                         * for (int a = 0; a < separatedContext[1].Length; a++)
                                         * {
                                         *  for (int b = 0; b < not_allowedCharsforArticle.Length; b++)
                                         *  {
                                         *      if (separatedContext[1].ElementAt(a) == not_allowedCharsforArticle[b])
                                         *          separatedContext[1].Remove(a, 1);
                                         *  }
                                         * }
                                         */
                                        #endregion
                                        if (separatedContext[1].Length >= 2)
                                        {
                                            _title = separatedContext[1];
                                        }
                                    }
                                    else if (separatedContext[0].Contains("abstract"))
                                    {
                                        #region little_modification_for_abstract_clearing
                                        for (int a = 0; a < separatedContext[1].Length; a++)
                                        {
                                            for (int b = 0; b < not_allowedCharsforArticle.Length; b++)
                                            {
                                                if (separatedContext[1].ElementAt(a) == not_allowedCharsforArticle[b])
                                                {
                                                    separatedContext[1].Remove(a, 1);
                                                }
                                            }
                                        }
                                        #endregion
                                        if (separatedContext[1].Length >= 5)
                                        {
                                            _abstract = separatedContext[1];
                                        }
                                    }
                                    else if (separatedContext[0].Contains("keywords"))
                                    {
                                        if (separatedContext[1] != String.Empty || separatedContext[1] != " ")
                                        {
                                            _keywords = separatedContext[1];
                                        }
                                        else
                                        {
                                            continue;
                                        }
                                    }
                                    else if (separatedContext[0].Contains("year"))
                                    {
                                        //year filter
                                        //if (Convert.ToInt32(separatedContext[1]) >= 1960)
                                        _year = Convert.ToInt32(separatedContext[1]);
                                        //else continue;
                                    }
                                    else if (separatedContext[0].Contains("country"))
                                    {
                                        _country = separatedContext[1];
                                    }
                                    else if (separatedContext[0].Contains("author"))
                                    {
                                        _authorsLine = separatedContext[1];
                                        _authors     = separatedContext[1].Split(authorSeparator, StringSplitOptions.RemoveEmptyEntries);
                                    }
                                    else if (separatedContext[0].Contains("organization"))
                                    {
                                        _organization = separatedContext[1];
                                    }
                                    else if (separatedContext[0].Contains("url"))
                                    {
                                        _url = separatedContext[1];
                                    }
                                    else
                                    {
                                        continue;
                                    }
                                    #endregion
                                }
                                catch (Exception ex)
                                {
                                    //if (ex.InnerException.GetType() == typeof(IndexOutOfRangeException))
                                    //{
                                    //File.WriteAllText(@"F:\\Magistry files\PG_crawler_Log.txt", ex.ToString());
                                    //return;
                                    //}
                                    continue;
                                }
                            }
                        }
                    }
                }
                #region bibtexLibrary

                /*
                 * if(reader.ToString() != null)
                 * {
                 *  string fileEntry = reader.ReadToEnd();
                 *  string fileEntry_filter1 = fileEntry.Replace('*', ' ');
                 *  //string fileEntry_filter2 = fileEntry_filter1.Replace('{', ' ');
                 * // string fileEntry_filter3 = fileEntry_filter2.Replace('}', ' ');
                 *  string fileEntry_filter2 = fileEntry_filter1.Replace('/', ' ');
                 *  if (fileEntry_filter2!=String.Empty && fileEntry_filter2.Contains("title = ") && fileEntry_filter2 != null)
                 *  {
                 *      BibTeXLibrary.BibParser parser = new BibParser(new StringReader(fileEntry));
                 *      var entry = parser.GetAllResult()[0];
                 *      if(!entry.ToString().Contains("publication100010"))
                 *      {
                 *          Console.WriteLine(entry["title"]);
                 *          Console.WriteLine(entry["abstract"]);
                 *          Console.WriteLine(entry["keywords"]);
                 *          Console.WriteLine(entry["year"]);
                 *          Console.WriteLine(entry["author"]);
                 *          Console.WriteLine(entry["organization"]);
                 *          Console.WriteLine(entry["url"]);
                 *      }
                 *      else
                 *      {
                 *          file.Skip(1);
                 *      }
                 *  }
                 *  else if (fileEntry_filter2 == String.Empty || !fileEntry_filter2.Contains("title = ") || fileEntry_filter2 == null)
                 *  {
                 *      file.Skip(1);
                 *  }
                 *  else{
                 *      Console.WriteLine("Error!");
                 *      return;
                 *  }
                 */
                #endregion
                try
                {
                    #region Bibtex_Entity_Object_Creation_Model_First
                    //
                    using (var dbContext = new ArticleDBDataModelContainer())
                    {
                        var document = new StringBuilder();

                        var bibtexArticle = dbContext.PG_ArticlesSet.Create();

                        bibtexArticle.title = _title;
                        if (_title != String.Empty || _title != " " || _title != null)
                        {
                            var termTitle = TextPreparing.TermsPrepataions(_title);
                            document.Append(termTitle);
                        }
                        _title = null;

                        bibtexArticle.abstractText = _abstract;
                        if (_abstract != String.Empty || _abstract != " " || _abstract != null)
                        {
                            var termAbstract = TextPreparing.TermsPrepataions(_abstract);
                            document.Append(termAbstract);
                        }
                        _abstract = null;

                        bibtexArticle.keywords = _keywords;
                        if (_keywords != String.Empty || _keywords != " " || _keywords != null)
                        {
                            var termKeywords = TextPreparing.TermsPrepataions(_keywords);
                            document.Append(termKeywords);
                        }
                        _keywords             = null;
                        bibtexArticle.year    = _year;
                        bibtexArticle.country = _country;
                        _country = null;
                        bibtexArticle.authors = _authorsLine;
                        _authorsLine          = null;
                        // TODO: add the authors to the Authors entity class, taking values two at a time (name, surname) from the authors[] array
                        bibtexArticle.organizations = _organization;
                        _organization     = null;
                        bibtexArticle.url = _url;
                        _url = null;


                        for (int i = 0; i <= _authors.Length - 2;)
                        {
                            var authors_of_the_article = dbContext.AuthorSet.Create();
                            authors_of_the_article.author_name     = _authors[i];
                            authors_of_the_article.author_surename = _authors[i + 1];
                            bibtexArticle.Author.Add(authors_of_the_article);
                            i += 2;
                        }

                        dbContext.PG_ArticlesSet.Add(bibtexArticle);

                        var _document = document.ToString().Split(' ', ';', ':', ',');


                        // added 11.02
                        for (int p = 0; p < _document.Length; p++)
                        {
                            for (int z = 0; z < not_allowedCharsforArticle.Length; z++)
                            {
                                if (_document[p].Contains(not_allowedCharsforArticle[z]))
                                {
                                    _document[p].Remove(z, 1);
                                }
                            }

                            // added 11.02
                            List <string> stringHashSet = new List <string>();
                            stringHashSet = _document.ToList();

                            foreach (var element in stringHashSet)
                            {
                                if (element == String.Empty || element == null || element == " ")
                                {
                                    stringHashSet.Remove(element);
                                }
                                else if (element.Length <= 3)
                                {
                                    stringHashSet.Remove(element);
                                }
                            }

                            new_document = stringHashSet.ToArray();
                        }

                        for (int k = 0; k <= new_document.Length - 1; k++)
                        {
                            var terms = dbContext.Terms_Vocabulary.Create();

                            string   dictionary_text    = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv");
                            string[] allowed_dictionary = dictionary_text.Split(',', '\n');
                            #region old_cleaning_code_11.02.2018
                            // added 10.02.2018 - cleaning the article term list

                            /*
                             * for (int i = 0; i <= new_document.Length - 1; i++)
                             * {
                             *  for (int j = 0; j <= allowed_dictionary.Length - 1; j++)
                             *  {
                             *      if (new_document[i].Length > 3 && new_document[i].Contains(allowed_dictionary[j]))
                             *      {
                             *          continue;
                             *      }
                             *      else if (new_document[i].Length < 3 && !(new_document[i].Contains(allowed_dictionary[j])))
                             *      {
                             *          new_document.ToList().RemoveAt(i);
                             *      }
                             *  }
                             * }
                             */
                            #endregion
                            #region old_version_11.02.2018
                            // TODO: the id of the document in which the given word occurs needs to be recorded here
                            //if (new_document[k] != String.Empty || new_document[k] != " " || new_document[k] != null || new_document[k] != Char.IsDigit(' ').ToString())
                            //{
                            //dbContext.Terms_Vocabulary.Where(u)
                            #endregion
                            //var termVocabularyTable = dbContext.Terms_Vocabulary;

                            /* 21.08 doesn't work properly - fix in progress
                             * // 21.08 If you need a fast but less accurate run - don't use this
                             * for (int i=0; i<k; i++)
                             * {
                             *  var query = GetTerms_Vocabulary(dbContext);
                             *  var query_list = new List<Terms_Vocabulary>();
                             *  foreach(var element in query)
                             *  {
                             *      query_list = query.ToList();
                             *  }
                             *  //if (query_list.Count == 0)
                             *  for(int j = 0; i < query_list.Count; j++)
                             *  {
                             *      if (query_list[j].term_value != new_document[k] | !(query_list[j].term_value.Contains(new_document[k])))
                             *      {
                             *          terms.term_value = new_document[k];
                             *          bibtexArticle.Terms_Vocabulary.Add(terms);
                             *      }
                             *      else
                             *          continue;
                             *  }
                             * }
                             * //
                             */

                            terms.term_value = new_document[k];        //-- 21.08 old and fast but not effective
                            //}
                            bibtexArticle.Terms_Vocabulary.Add(terms); //-- 21.08 old and fast but not effective
                        }
                        dbContext.SaveChanges();
                    }
                    #endregion

                    ///<summary>
                    /// BibtexArticle_Entity_Object_Creation
                    /// </summary>
                    #region BibtexArticle_Entity_Object_Creation

                    /*
                     * using (var db = new PublicationsContext())
                     * {
                     *  var bibtexArticle = new BibtexArticle();
                     *  bibtexArticle.title = _title;
                     *  _title = null;
                     *  bibtexArticle.abstractText = _abstract;
                     *  _abstract = null;
                     *  bibtexArticle.keywords = _keywords;
                     *  _keywords = null;
                     *  bibtexArticle.year = _year;
                     *  bibtexArticle.country = _country;
                     *  _country = null;
                     *  bibtexArticle.authors = _authorsLine;
                     *  _authorsLine = null;
                     *  //potrzebnie dorobic dodawanie autorow po 2 wartosci z tabeli authors[] do klasy Entity Authors
                     *  bibtexArticle.organizations = _organization;
                     *  _organization = null;
                     *  bibtexArticle.url = _url;
                     *  _url = null;
                     *
                     *
                     *  var authors_of_the_article = new Authors();
                     *  for (int i = 0; i <= _authors.Length - 2; i++)
                     *  {
                     *      authors_of_the_article.author_name = _authors[i];
                     *      authors_of_the_article.author_surename = _authors[i + 1];
                     *      bibtexArticle.author_Id = authors_of_the_article.author_Id;
                     *      db.Authors.Add(authors_of_the_article);
                     *  }
                     *
                     *  db.PG_Articles.Add(bibtexArticle);
                     *  db.SaveChanges();
                     * }
                     */
                    #endregion
                    Console.WriteLine("End of file! Go to the next ->");
                }
                catch (Exception ex)
                {
                    File.WriteAllText(@"F:\\Magistry files\PG_crawler_Log.txt", ex.ToString());
                }
            }
        }
Exemplo n.º 10
0
        public static void get_PP_Document_content()
        {
            string[] PP_newcontent       = new string[hapDoc.DocumentNode.InnerText.Length];
            string[] PP_separatedContent = new string[hapDoc.DocumentNode.InnerText.Length];

            PP_articles_Count = 0;
            string[] PP_articles_Matrix = { String.Empty };

            using (StringReader sr = new StringReader(endText))
            {
                int    p = 0;
                string PP_line;

                while ((PP_line = sr.ReadLine()) != null)
                {
                    PP_newcontent[p]    = PP_line;
                    PP_separatedContent = PP_line.Split(line_separator, 2);

                    if (PP_separatedContent.Length == 1 & PP_separatedContent[0] == "")
                    {
                        continue;
                    }
                    else if (PP_separatedContent.Length == 1 && PP_articles_Matrix.Any(x => PP_separatedContent[0].Contains(x)))
                    {
                        if (PP_author_line != null && PP_Tytul != null)
                        {
                            try
                            {
                                using (var PPdbContext = new ArticleDBDataModelContainer())
                                {
                                    var document   = new StringBuilder();
                                    var pp_article = PPdbContext.PP_ArticlesSet.Create();

                                    pp_article.article_author_line = PP_author_line;
                                    PP_author_line = null;

                                    pp_article.article_title = PP_Tytul;
                                    if (PP_Tytul != String.Empty || PP_Tytul != " " || PP_Tytul != null)
                                    {
                                        var termTitlePP = TextPreparing.TermsPrepataions(PP_Tytul);
                                        document.Append(termTitlePP);
                                    }
                                    PP_Tytul = null;

                                    pp_article.article_source = PP_Zrodlo;
                                    if (PP_Zrodlo != String.Empty || PP_Zrodlo != " " || PP_Zrodlo != null)
                                    {
                                        var termSourcePP = TextPreparing.TermsPrepataions(PP_Zrodlo);
                                        document.Append(termSourcePP);
                                    }
                                    else
                                    {
                                        PP_Zrodlo = "Not defined";
                                        document.Append(PP_Zrodlo);
                                    }
                                    PP_Zrodlo = null;

                                    pp_article.article_year = PP_Rok;
                                    PP_Rok = 0;
                                    pp_article.article_language = PP_Jezyk_Publikacji;
                                    PP_Jezyk_Publikacji         = null;
                                    pp_article.article_DOI      = PP_DOI;
                                    PP_DOI = null;

                                    /*
                                     * pp_article.article_details = PP_Uwagi;
                                     * PP_Uwagi = null;
                                     * pp_article.article_URL = PP_Adres_URL;
                                     * PP_Adres_URL = null;
                                     */

                                    for (int z = 0; z <= PP_autors.Length - 4;)
                                    {
                                        var authors_of_the_PP_article = PPdbContext.AuthorSet.Create();
                                        if (PP_autors[z] != "IC)")
                                        {
                                            authors_of_the_PP_article.author_name     = PP_autors[z + 1];
                                            authors_of_the_PP_article.author_surename = PP_autors[z];
                                            pp_article.Author.Add(authors_of_the_PP_article);
                                        }
                                        z += 4;
                                    }
                                    PPdbContext.PP_ArticlesSet.Add(pp_article);

                                    var _document = document.ToString().Split(' ', ';', ':', ',');
                                    for (int k = 0; k <= _document.Length - 1; k++)
                                    {
                                        var terms = PPdbContext.Terms_Vocabulary.Create();

                                        string   dictionary_text    = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv");
                                        string[] allowed_dictionary = dictionary_text.Split(',', '\n');

                                        for (int d = 0; d <= _document.Length - 1; d++)
                                        {
                                            for (int j = 0; j <= allowed_dictionary.Length - 1; j++)
                                            {
                                                if (_document[d].Length > 3 && _document[d].Contains(allowed_dictionary[j]))
                                                {
                                                    continue;
                                                }
                                                else if (_document[d].Length <= 3 && !(_document[d].Contains(allowed_dictionary[j])))
                                                {
                                                    _document.ToList().RemoveAt(d);
                                                }
                                            }
                                        }
                                        // TODO: the id of the document in which the given word occurs needs to be recorded here
                                        if (_document[k] != String.Empty || _document[k] != " " || _document[k] != null || _document[k] != Char.IsDigit(' ').ToString())
                                        {
                                            //dbContext.Terms_Vocabulary.Where(u)
                                            var termVocabularyTable = PPdbContext.Terms_Vocabulary;
                                            terms.term_value = _document[k];
                                        }
                                        pp_article.Terms_Vocabulary.Add(terms);
                                    }
                                    PPdbContext.SaveChanges();
                                }
                            }
                            catch (Exception ex)
                            {
                                File.WriteAllText(@"F:\\Magistry files\PP_crawler_Log.txt", ex.ToString());
                            }
                        }
                        else
                        {
                            File.WriteAllText(@"F:\\Magistry files\PP_crawler_Log.txt", "Empty line detected." + '\n');
                        }
                    }
                    else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Liczba odnalezionych") || PP_separatedContent[0] == "Liczba odnalezionych rekordow"))
                    {
                        PP_articles_Count  = Convert.ToInt32(PP_separatedContent[1]);
                        PP_articles_Matrix = new string[PP_articles_Count];
                        for (int l = 0; l <= PP_articles_Count - 1; l++)
                        {
                            PP_articles_Matrix[l] = (l + 1) + ".";
                        }
                    }
                    if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].ToLower().Contains("autor") || PP_separatedContent[0].Contains("Autor") || PP_separatedContent[0] == "Autor"))
                    {
                        PP_author_line = PP_separatedContent[1];
                        var PP_author_line_modified = PP_author_line.Replace("(", String.Empty);
                        PP_autors = PP_separatedContent[1].Split(autor_separators, StringSplitOptions.RemoveEmptyEntries);
                    }
                    else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].ToLower().Contains("tytu") || PP_separatedContent[0].ToLower().Contains("tytul") || PP_separatedContent[0].Contains("Tytul")))
                    {
                        PP_Tytul = PP_separatedContent[1];
                    }
                    else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Zrodlo") || PP_separatedContent[0].ToLower().Contains("zrodlo")))
                    {
                        PP_Zrodlo = PP_separatedContent[1];
                    }
                    else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Rok") || PP_separatedContent[0].ToLower().Contains("rok")))
                    {
                        string rok = "";
                        if (PP_separatedContent[1] != "" | PP_separatedContent[1] == String.Empty)
                        {
                            rok = null;
                        }
                        else
                        {
                            rok = PP_separatedContent[1].Substring(0, 5);
                        }

                        PP_Rok = Convert.ToInt32(rok);
                    }
                    else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Jezyk publikacji") || PP_separatedContent[0].ToLower().Contains("jezyk publikacji") || PP_separatedContent[0].Contains("Język publikacji") || PP_separatedContent[0].ToLower().Contains("język publikacji")))
                    {
                        PP_Jezyk_Publikacji = PP_separatedContent[1];
                    }
                    else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("DOI") || PP_separatedContent[0].ToLower().Contains("doi") || PP_separatedContent[0] == "DOI"))
                    {
                        PP_DOI = PP_separatedContent[1];
                    }
                    p++;
                }
                #region Old_code

                /* 22.08.2018 - old version
                 * for (int i = 0; i <= hapDoc.DocumentNode.InnerText.Length; i++)
                 * {
                 *  PP_line = sr.ReadLine();
                 *  int counter = 0;
                 *  if (PP_line != null)
                 *  {
                 *      PP_newcontent[i] = PP_line;
                 *      PP_separatedContent = PP_line.Split(line_separator,2);
                 *
                 *
                 *      if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].ToLower().Contains("autor") || PP_separatedContent[0].Contains("Autor") || PP_separatedContent[0] == "Autor"))
                 *      {
                 *          //System.Windows.MessageBox.Show(PP_separatedContent[1]);
                 *          PP_author_line = PP_separatedContent[1];
                 *          var PP_author_line_modified = PP_author_line.Replace("(", String.Empty);
                 *
                 *          PP_autors = PP_separatedContent[1].Split(autor_separators, StringSplitOptions.RemoveEmptyEntries);
                 *
                 *      }
                 *      else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Liczba odnalezionych") || PP_separatedContent[0] == "Liczba odnalezionych rekordow"))
                 *      {
                 *          PP_articles_Count = Convert.ToInt32(PP_separatedContent[1]);
                 *          PP_articles_Matrix = new string[PP_articles_Count];
                 *          for (int l = 0; l <= PP_articles_Count - 1; l++)
                 *          {
                 *              PP_articles_Matrix[l] = (l + 1) + ".";
                 *          }
                 *      }
                 *      else if (PP_separatedContent.Length == 1 && PP_articles_Matrix.Any(x => PP_separatedContent[0].Contains(x)))
                 *      {
                 *          if (PP_author_line != null && PP_Tytul != null)
                 *          {
                 *              ///<summary>
                 *              ///PPArticle_Entity_Object_creation_Model_first
                 *              /// </summary>
                 *              try
                 *              {
                 #region PP_Article_Object_creation_Model_First
                 *                  using (var PPdbContext = new ArticleDBDataModelContainer())
                 *                  {
                 *                      var document = new StringBuilder();
                 *                      var pp_article = PPdbContext.PP_ArticlesSet.Create();
                 *
                 *                      pp_article.article_author_line = PP_author_line;
                 *                      PP_author_line = null;
                 *
                 *                      pp_article.article_title = PP_Tytul;
                 *                      if (PP_Tytul != String.Empty || PP_Tytul != " " || PP_Tytul != null)
                 *                      {
                 *                          var termTitlePP = TextPreparing.TermsPrepataions(PP_Tytul);
                 *                          document.Append(termTitlePP);
                 *                      }
                 *                      PP_Tytul = null;
                 *
                 *                      pp_article.article_source = PP_Zrodlo;
                 *                      if (PP_Zrodlo != String.Empty || PP_Zrodlo != " " || PP_Zrodlo != null)
                 *                      {
                 *                          var termSourcePP = TextPreparing.TermsPrepataions(PP_Zrodlo);
                 *                          document.Append(termSourcePP);
                 *                      }
                 *                      PP_Zrodlo = null;
                 *
                 *                      pp_article.article_year = PP_Rok;
                 *                      PP_Rok = 0;
                 *                      pp_article.article_language = PP_Jezyk_Publikacji;
                 *                      PP_Jezyk_Publikacji = null;
                 *                      pp_article.article_DOI = PP_DOI;
                 *                      PP_DOI = null;
                 *                      //
                 *                      pp_article.article_details = PP_Uwagi;
                 *                      PP_Uwagi = null;
                 *                      pp_article.article_URL = PP_Adres_URL;
                 *                      PP_Adres_URL = null;
                 *                      //
                 *
                 * for (int z = 0; z <= PP_autors.Length - 4;)
                 *                      {
                 *                          var authors_of_the_PP_article = PPdbContext.AuthorSet.Create();
                 *                          if (PP_autors[z] != "IC)")
                 *                          {
                 *                              authors_of_the_PP_article.author_name = PP_autors[z + 1];
                 *                              authors_of_the_PP_article.author_surename = PP_autors[z];
                 *                              pp_article.Author.Add(authors_of_the_PP_article);
                 *                          }
                 *                          z += 4;
                 *                      }
                 *                      PPdbContext.PP_ArticlesSet.Add(pp_article);
                 *
                 *                      var _document = document.ToString().Split(' ', ';', ':', ',');
                 *                      for (int k = 0; k <= _document.Length - 1; k++)
                 *                      {
                 *                          var terms = PPdbContext.Terms_Vocabulary.Create();
                 *
                 *                          //
                 *                          string dictionary_text = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv");
                 *                          string[] allowed_dictionary = dictionary_text.Split(',', '\n');
                 *
                 *                          for (int p = 0; p <= _document.Length - 1; p++)
                 *                          {
                 *                              for (int j = 0; j <= allowed_dictionary.Length - 1; j++)
                 *                              {
                 *                                  if (_document[p].Length > 3 && _document[p].Contains(allowed_dictionary[j]))
                 *                                  {
                 *                                      continue;
                 *                                  }
                 *                                  else if (_document[p].Length <= 3 && !(_document[p].Contains(allowed_dictionary[j])))
                 *                                  {
                 *                                      _document.ToList().RemoveAt(p);
                 *                                  }
                 *
                 *                              }
                 *                          }
                 *                          //tutaj potrzebnie przepisac id dokumenta w ktorym wystepuje dane slowo
                 *                          if (_document[k] != String.Empty || _document[k] != " " || _document[k] != null || _document[k] != Char.IsDigit(' ').ToString())
                 *                          {
                 *                              //dbContext.Terms_Vocabulary.Where(u)
                 *                              var termVocabularyTable = PPdbContext.Terms_Vocabulary;
                 *                              terms.term_value = _document[k];
                 *
                 *                          }
                 *                          pp_article.Terms_Vocabulary.Add(terms);
                 *                      }
                 *
                 *                      PPdbContext.SaveChanges();
                 *                  }
                 #endregion
                 *              }
                 *              catch (Exception ex)
                 *              {
                 *                  File.WriteAllText(@"F:\\Magistry files\PP_crawler_Log.txt", ex.ToString());
                 *              }
                 *              ///<summary>
                 *              /// PPArticle_Entity_Object_Creation
                 *              /// </summary>
                 #region PPArticle_Entity_Object_Creation
                 *              /*
                 *              using (var dbppcontext = new PublicationsContext())
                 *              {
                 *                  var pp_article = new PPArticle();
                 *                  pp_article.article_author_line = PP_author_line;
                 *                  PP_author_line = null;
                 *                  pp_article.article_title = PP_Tytul;
                 *                  PP_Tytul = null;
                 *                  pp_article.article_source = PP_Zrodlo;
                 *                  PP_Zrodlo = null;
                 *                  pp_article.article_year = PP_Rok;
                 *                  PP_Rok = 0;
                 *                  pp_article.article_language = PP_Jezyk_Publikacji;
                 *                  PP_Jezyk_Publikacji = null;
                 *                  pp_article.article_DOI = PP_DOI;
                 *                  PP_DOI = null;
                 *                  pp_article.article_details = PP_Uwagi;
                 *                  PP_Uwagi = null;
                 *                  pp_article.article_URL = PP_Adres_URL;
                 *                  PP_Adres_URL = null;
                 *
                 *
                 *
                 *                  var authors_of_the_article = new Authors();
                 *                  for (int k = 0; k <= PP_autors.Length - 2; k++)
                 *                  {
                 *                      authors_of_the_article.author_name = PP_autors[k];
                 *                      authors_of_the_article.author_surename = PP_autors[k + 1];
                 *                      dbppcontext.Authors.Add(authors_of_the_article);
                 *
                 *                  }
                 *                  //dbppcontext.PP_Articles.Add(pp_article);
                 *                  dbppcontext.PP_Articles.Attach(pp_article);
                 *                  dbppcontext.Entry(pp_article).State = System.Data.Entity.EntityState.Added;
                 *                  dbppcontext.SaveChanges();
                 *                  //dbppcontext.SaveChanges();
                 *              }
                 *              //
                 *              //#endregion
                 *          }
                 *          else
                 *          {
                 *              //System.Windows.MessageBox.Show("Brak danych");
                 *          }
                 *
                 *      }
                 *      else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].ToLower().Contains("tytu") || PP_separatedContent[0].ToLower().Contains("tytul") || PP_separatedContent[0].Contains("Tytul")))
                 *      {
                 *          PP_Tytul = PP_separatedContent[1];
                 *          //System.Windows.MessageBox.Show(PP_Tytul);
                 *      }
                 *      else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Zrodlo") || PP_separatedContent[0].ToLower().Contains("zrodlo")))
                 *      {
                 *          PP_Zrodlo = PP_separatedContent[1];
                 *          //System.Windows.MessageBox.Show(PP_Zrodlo);
                 *      }
                 *      else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Rok") || PP_separatedContent[0].ToLower().Contains("rok")))
                 *      {
                 *          var rok = PP_separatedContent[1].Substring(0, 5);
                 *          PP_Rok = Convert.ToInt32(rok);
                 *          //System.Windows.MessageBox.Show(PP_Rok.ToString());
                 *      }
                 *      else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Jezyk publikacji") || PP_separatedContent[0].ToLower().Contains("jezyk publikacji") || PP_separatedContent[0].Contains("Język publikacji") || PP_separatedContent[0].ToLower().Contains("język publikacji")))
                 *      {
                 *          PP_Jezyk_Publikacji = PP_separatedContent[1];
                 *          //System.Windows.MessageBox.Show(PP_Jezyk_Publikacji);
                 *      }
                 *      else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("DOI") || PP_separatedContent[0].ToLower().Contains("doi") || PP_separatedContent[0] == "DOI"))
                 *      {
                 *          PP_DOI = PP_separatedContent[1];
                 *          //System.Windows.MessageBox.Show(PP_DOI);
                 *      }
                 *      /*
                 *      else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Uwagi") || PP_separatedContent[0].ToLower().Contains("uwagi") || PP_separatedContent[0] == "Uwagi"))
                 *      {
                 *          PP_Uwagi = PP_separatedContent[1];
                 *          System.Windows.MessageBox.Show(PP_Uwagi);
                 *      }
                 *      else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Adres url") || PP_separatedContent[0].ToLower().Contains("adres url") || PP_separatedContent[0] == "Adres url"))
                 *      {
                 *          PP_Adres_URL = PP_separatedContent[1];
                 *          System.Windows.MessageBox.Show(PP_Adres_URL = PP_separatedContent[1]);
                 *      }
                 *      //
                 *
                 *      //else if (PP_separatedContent.Length == 1 && PP_separatedContent[0] == String.Empty) System.Windows.MessageBox.Show("The empty line detected", "Empty line", System.Windows.MessageBoxButton.OK);
                 *      else
                 *      {
                 *          //System.Windows.MessageBox.Show("Error! Content not found!", "Error!", System.Windows.MessageBoxButton.OK);
                 *
                 *      }
                 *      counter++;
                 *  }
                 * }
                 */
                #endregion
            }
        }
Exemplo n.º 11
0
        /// <summary>
        /// Reads <c>endText</c> line by line, collects the labelled fields of each UMK record into the
        /// static <c>UMK_*</c> fields and saves a record to the database whenever a record-marker
        /// line is reached.
        /// </summary>
        /// <remarks>
        /// Every line is split once on <c>line_separator</c> into a label and a value.
        /// Lines without a value are either blank, a record marker ("1.", "2.", ... built once the
        /// "Liczba odnalezionych" line has been read) or a URL. Lines with a value update the field
        /// selected by their label.
        /// NOTE(review): a record is only saved when a later marker line is seen, so the last record
        /// in <c>endText</c> appears never to be flushed - confirm whether the input ends with a marker.
        /// </remarks>
        public static void get_UMK_Document_content()
        {
            UMK_articles_Count = 0;

            // Until the record count is known the only marker is the empty string, which every
            // string contains, so any non-blank line without a value counts as a record boundary.
            string[] recordMarkers = { String.Empty };

            using (StringReader sr = new StringReader(endText))
            {
                string UMK_line;
                while ((UMK_line = sr.ReadLine()) != null)
                {
                    string[] UMK_separatedContent = UMK_line.Split(line_separator, 2);
                    string   label                = UMK_separatedContent[0];

                    if (UMK_separatedContent.Length < 2)
                    {
                        if (label.Length == 0)
                        {
                            // Blank line: nothing to do.
                        }
                        else if (recordMarkers.Any(marker => label.Contains(marker)))
                        {
                            // Record boundary: persist what was collected, but only when at least
                            // the author line and the title of the record are known.
                            // NOTE(review): a URL that happens to contain a marker such as "10."
                            // also lands here instead of in the URL branch - confirm marker format.
                            if (UMK_author_line != null && UMK_Tytul != null)
                            {
                                UMK_SaveCurrentArticle();
                            }
                        }
                        else if (UMK_LabelContains(label, "http://", "https://"))
                        {
                            UMK_Adres_URL = label;
                        }
                        continue;
                    }

                    string value = UMK_separatedContent[1];

                    if (UMK_LabelContains(label, "aut."))
                    {
                        UMK_autors      = value.Split(separators, StringSplitOptions.RemoveEmptyEntries);
                        UMK_author_line = value;
                    }
                    else if (UMK_LabelContains(label, "zapyt"))
                    {
                        // NOTE(review): SQL text is concatenated from crawled content; if this string
                        // is ever executed it must be replaced by a parameterized query.
                        UMK_Zapytanie_Wyszukiwania = "SELECT * FROM UMK_Splendor_Expertus_article_database WHERE article LIKE " + value;
                    }
                    else if (label.Contains("Liczba odnalezionych"))
                    {
                        UMK_articles_Count = Convert.ToInt32(value);
                        recordMarkers      = new string[UMK_articles_Count];
                        for (int z = 0; z < UMK_articles_Count; z++)
                        {
                            recordMarkers[z] = (z + 1) + ".";
                        }
                    }
                    // The three specific title labels all contain "tytu", so they have to be tested
                    // before the generic title branch; otherwise they can never be reached and
                    // their values overwrite the article title.
                    else if (UMK_LabelContains(label, "tytuł wydawn. zbior.", "tytul wydawn. zbior."))
                    {
                        UMK_Tytul_Wydawn_Zbior = value;
                    }
                    else if (UMK_LabelContains(label, "pełny tytuł czasop.", "pelny tytul czasop."))
                    {
                        UMK_Pelny_tytul_czasop = value;
                    }
                    else if (UMK_LabelContains(label, "tytuł równoległy", "tytul rownolegly"))
                    {
                        UMK_Tytul_rownolegly = value;
                    }
                    else if (UMK_LabelContains(label, "tytu"))
                    {
                        UMK_Tytul = value;
                    }
                    else if (UMK_LabelContains(label, "opis wydawn"))
                    {
                        UMK_Opis_wydawn = value;
                    }
                    else if (UMK_LabelContains(label, "język", "jezyk"))
                    {
                        UMK_Jezyk_Publikacji = value;
                    }
                    else if (UMK_LabelContains(label, "polskie słowa kluczowe", "polskie slo"))
                    {
                        UMK_Slowa_kluczowe_j_pl = value.Split(separators);
                        UMK_pl_keywords_line    = value;
                    }
                    else if (UMK_LabelContains(label, "angielskie słowa kluczowe", "angielskie slowa kluczowe"))
                    {
                        UMK_Slowa_kluczowe_j_ang = value.Split(separators);
                        UMK_en_keywords_line     = value;
                    }
                }
            }
        }

        /// <summary>
        /// Returns true when <paramref name="label"/> contains at least one of
        /// <paramref name="fragments"/>, compared without regard to case.
        /// </summary>
        private static bool UMK_LabelContains(string label, params string[] fragments)
        {
            return fragments.Any(fragment => label.IndexOf(fragment, StringComparison.OrdinalIgnoreCase) >= 0);
        }

        /// <summary>
        /// Appends the prepared terms of <paramref name="value"/> to <paramref name="document"/>.
        /// Missing or blank values are skipped so that they do not produce vocabulary terms.
        /// </summary>
        private static void UMK_AppendTerms(StringBuilder document, string value)
        {
            if (!string.IsNullOrWhiteSpace(value))
            {
                document.Append(TextPreparing.TermsPrepataions(value));
            }
        }

        /// <summary>
        /// Creates one UMK article entity from the currently collected static <c>UMK_*</c> fields,
        /// attaches its authors and vocabulary terms, saves it and resets the collected fields to null.
        /// Fields that were never set are stored as "Not_defined".
        /// </summary>
        private static void UMK_SaveCurrentArticle()
        {
            const string notDefined = "Not_defined";

            using (var dbContext = new ArticleDBDataModelContainer())
            {
                var document    = new StringBuilder();
                var umk_article = dbContext.UMK_ArticlesSet.Create();

                umk_article.article_author_line = UMK_author_line ?? notDefined;
                UMK_author_line = null;

                umk_article.article_title = UMK_Tytul ?? notDefined;
                UMK_AppendTerms(document, UMK_Tytul);
                UMK_Tytul = null;

                umk_article.article_Full_title = UMK_Pelny_tytul_czasop ?? notDefined;
                UMK_AppendTerms(document, UMK_Pelny_tytul_czasop);
                UMK_Pelny_tytul_czasop = null;

                umk_article.article_language = UMK_Jezyk_Publikacji ?? notDefined;
                UMK_Jezyk_Publikacji         = null;

                umk_article.article_translated_title = UMK_Tytul_rownolegly ?? notDefined;
                UMK_AppendTerms(document, UMK_Tytul_rownolegly);
                UMK_Tytul_rownolegly = null;

                umk_article.article_eng_keywords = UMK_en_keywords_line ?? notDefined;
                UMK_AppendTerms(document, UMK_en_keywords_line);
                UMK_en_keywords_line = null;

                umk_article.article_pl_keywords = UMK_pl_keywords_line ?? notDefined;
                UMK_AppendTerms(document, UMK_pl_keywords_line);
                UMK_pl_keywords_line = null;

                umk_article.article_url = UMK_Adres_URL ?? notDefined;
                UMK_Adres_URL           = null;

                umk_article.article_publisher_title = UMK_Tytul_Wydawn_Zbior ?? notDefined;
                UMK_Tytul_Wydawn_Zbior = null;

                umk_article.article_publisher_desc = UMK_Opis_wydawn ?? notDefined;
                UMK_Opis_wydawn = null;

                // The author tokens are consumed pairwise as (name, surname); a trailing
                // unpaired token is ignored.
                for (int k = 0; k + 1 < UMK_autors.Length; k += 2)
                {
                    var authors_of_the_article = dbContext.AuthorSet.Create();
                    authors_of_the_article.author_name     = UMK_autors[k];
                    authors_of_the_article.author_surename = UMK_autors[k + 1];
                    umk_article.Author.Add(authors_of_the_article);
                }

                dbContext.UMK_ArticlesSet.Add(umk_article);

                // NOTE(review): the previous code re-read Allowed_term_dictionary.csv for every token
                // and "removed" short tokens from a temporary copy of the array, so no token was
                // ever filtered. That no-op was dropped; add a real dictionary filter here once
                // its rules are decided.
                // TODO: the id of the document in which a term occurs still has to be recorded.
                foreach (string token in document.ToString().Split(' ', ';', ':', ','))
                {
                    if (string.IsNullOrWhiteSpace(token))
                    {
                        continue;
                    }

                    var terms = dbContext.Terms_Vocabulary.Create();
                    terms.term_value = token;
                    umk_article.Terms_Vocabulary.Add(terms);
                }

                try
                {
                    dbContext.SaveChanges();
                }
                catch (Exception ex)
                {
                    // Append instead of overwrite so that earlier failures stay in the log.
                    File.AppendAllText(@"F:\\Magistry files\UMK_crawler_Log.txt", ex.ToString() + Environment.NewLine);
                }
            }
        }
Exemplo n.º 12
0
        //TODO: implement divide and conquer for large files

        public static void get_WSB_Document_content()
        {
            string[] WSB_newcontent       = new string[hapDoc.DocumentNode.InnerText.Length];
            string[] WSB_separatedContent = new string[hapDoc.DocumentNode.InnerText.Length];

            WSB_articles_Count = 0;
            string[] WSB_articles_Matrix = { String.Empty };

            using (StringReader sr = new StringReader(endText))
            {
                int    p = 0;
                string WSB_line;
                // 22.08.2018 New version of reader
                while ((WSB_line = sr.ReadLine()) != null)
                {
                    WSB_newcontent[p]    = WSB_line;
                    WSB_separatedContent = WSB_line.Split(line_separator, 2);
                    if (WSB_separatedContent.Length == 1 & WSB_separatedContent[0] == "")
                    {
                        continue;
                    }
                    else if (WSB_separatedContent.Length == 1 & WSB_articles_Matrix.Any(x => WSB_separatedContent[0].Contains(x)))
                    {
                        if (WSB_author_line != null & WSB_Tytul_pracy != null)
                        {
                            using (var dbContext = new ArticleDBDataModelContainer())
                            {
                                var document    = new StringBuilder();
                                var wsb_article = dbContext.WSB_ArticlesSet.Create();

                                if (WSB_author_line == null)
                                {
                                    WSB_author_line = "Not_defined";
                                }
                                wsb_article.article_authors = WSB_author_line;
                                WSB_author_line             = null;

                                if (WSB_Tytul_pracy == null)
                                {
                                    WSB_Tytul_pracy = "Not_defined";
                                }
                                wsb_article.article_title = WSB_Tytul_pracy;
                                if (WSB_Tytul_pracy != String.Empty | WSB_Tytul_pracy != " " | WSB_Tytul_pracy != null)
                                {
                                    var termTitle_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_pracy);
                                    document.Append(termTitle_WSB);
                                }
                                WSB_Tytul_pracy = null;

                                if (WSB_Adres_wydawniczy == null)
                                {
                                    WSB_Adres_wydawniczy = "Not_defined";
                                }
                                wsb_article.article_publisher_adres = WSB_Adres_wydawniczy;
                                WSB_Adres_wydawniczy = null;

                                if (WSB_Tytul_calosci == null)
                                {
                                    WSB_Tytul_calosci = "Not_defined";
                                }
                                wsb_article.article_common_title = WSB_Tytul_calosci;
                                if (WSB_Tytul_calosci != String.Empty | WSB_Tytul_calosci != " " | WSB_Tytul_calosci != null)
                                {
                                    var termFullTitle_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_calosci);
                                    document.Append(termFullTitle_WSB);
                                }
                                WSB_Tytul_calosci = null;

                                if (WSB_Slowa_kluczowe_j_pl_line == null)
                                {
                                    WSB_Slowa_kluczowe_j_pl_line = "Not_defined";
                                }
                                wsb_article.article_pl_keywords = WSB_Slowa_kluczowe_j_pl_line;
                                if (WSB_Slowa_kluczowe_j_pl_line != String.Empty | WSB_Slowa_kluczowe_j_pl_line != " " | WSB_Slowa_kluczowe_j_pl_line != null)
                                {
                                    var term_PL_Keywords_WSB = TextPreparing.TermsPrepataions(WSB_Slowa_kluczowe_j_pl_line);
                                    document.Append(term_PL_Keywords_WSB);
                                }
                                WSB_Slowa_kluczowe_j_pl_line = null;

                                if (WSB_Slowa_kluczowe_j_ang_line == null)
                                {
                                    WSB_Slowa_kluczowe_j_ang_line = "Not_defined";
                                }
                                wsb_article.article_eng_keywords = WSB_Slowa_kluczowe_j_ang_line;
                                if (WSB_Slowa_kluczowe_j_ang_line != String.Empty | WSB_Slowa_kluczowe_j_ang_line != " " | WSB_Slowa_kluczowe_j_ang_line != null)
                                {
                                    var term_Eng_Keywords_WSB = TextPreparing.TermsPrepataions(WSB_Slowa_kluczowe_j_ang_line);
                                    document.Append(term_Eng_Keywords_WSB);
                                }
                                WSB_Slowa_kluczowe_j_ang_line = null;

                                if (WSB_Tytul_pracy_w_innym_j == null)
                                {
                                    WSB_Tytul_pracy_w_innym_j = "Not_defined";
                                }
                                wsb_article.article_title_other_lang = WSB_Tytul_pracy_w_innym_j;
                                if (WSB_Tytul_pracy_w_innym_j != String.Empty | WSB_Tytul_pracy_w_innym_j != " " | WSB_Tytul_pracy_w_innym_j != null)
                                {
                                    var term_Title_Other_Lang_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_pracy_w_innym_j);
                                    document.Append(term_Title_Other_Lang_WSB);
                                }
                                WSB_Tytul_pracy_w_innym_j = null;

                                if (WSB_Szczegoly == null)
                                {
                                    WSB_Szczegoly = "Not_defined";
                                }
                                wsb_article.article_details = WSB_Szczegoly;
                                WSB_Szczegoly = null;

                                if (WSB_URL == null)
                                {
                                    WSB_URL = "Not_defined";
                                }
                                wsb_article.article_URL = WSB_URL;
                                WSB_URL = null;

                                if (WSB_DOI == null)
                                {
                                    WSB_DOI = "Not_defined";
                                }
                                wsb_article.article_DOI = WSB_DOI;
                                WSB_DOI = null;
                                for (int k = 0; k <= WSB_autors.Length - 2;)
                                {
                                    var authors_of_the_article = dbContext.AuthorSet.Create();
                                    authors_of_the_article.author_name     = WSB_autors[k];
                                    authors_of_the_article.author_surename = WSB_autors[k + 1];
                                    wsb_article.Author.Add(authors_of_the_article);
                                    k += 2;
                                }
                                dbContext.WSB_ArticlesSet.Add(wsb_article);

                                var _document = document.ToString().Split(' ', ';', ':', ',');
                                for (int k = 0; k <= _document.Length - 1; k++)
                                {
                                    var      terms              = dbContext.Terms_Vocabulary.Create();
                                    string   dictionary_text    = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv");
                                    string[] allowed_dictionary = dictionary_text.Split(',', '\n');

                                    for (int d = 0; d <= _document.Length - 1; d++)
                                    {
                                        for (int j = 0; j <= allowed_dictionary.Length - 1; j++)
                                        {
                                            if (_document[d].Length > 3 & _document[d].Contains(allowed_dictionary[j]))
                                            {
                                                continue;
                                            }
                                            else if (_document[d].Length <= 3 & !(_document[d].Contains(allowed_dictionary[j])))
                                            {
                                                _document.ToList().RemoveAt(d);
                                            }
                                        }
                                    }

                                    //tutaj potrzebnie przepisac id dokumenta w ktorym wystepuje dane slowo
                                    if (_document[k] != String.Empty | _document[k] != " " | _document[k] != null | _document[k] != Char.IsDigit(' ').ToString())
                                    {
                                        //dbContext.Terms_Vocabulary.Where(u)
                                        var termVocabularyTable = dbContext.Terms_Vocabulary;
                                        terms.term_value = _document[k];
                                    }
                                    wsb_article.Terms_Vocabulary.Add(terms);
                                }
                                try
                                {
                                    dbContext.SaveChanges();
                                }
                                catch (Exception ex)
                                {
                                    File.WriteAllText(@"F:\\Magistry files\WSB_crawler_Log.txt", ex.ToString());
                                }
                            }
                        }
                        else
                        {
                            continue;
                        }
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("autor") | WSB_separatedContent[0].Contains("Autor") | WSB_separatedContent[0] == "Autorzy"))
                    {
                        WSB_autors      = WSB_separatedContent[1].Split(autor_separators, StringSplitOptions.RemoveEmptyEntries);
                        WSB_author_line = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul pracy") | WSB_separatedContent[0].Contains("Tytul pracy") | WSB_separatedContent[0] == "Tytul pracy"))
                    {
                        WSB_Tytul_pracy = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].Contains("Liczba odnalezionych") | WSB_separatedContent[0] == "Liczba odnalezionych rekordow"))
                    {
                        WSB_articles_Count  = Convert.ToInt32(WSB_separatedContent[1]);
                        WSB_articles_Matrix = new string[WSB_articles_Count];
                        for (int z = 0; z <= WSB_articles_Count - 1; z++)
                        {
                            WSB_articles_Matrix[z] = (z + 1) + ".";
                        }
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("adres wydawniczy") | WSB_separatedContent[0].Contains("Adres wydawniczy") | WSB_separatedContent[0] == "Adres wydawniczy"))
                    {
                        WSB_Adres_wydawniczy = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("polskie hasla") | WSB_separatedContent[0].Contains("Polskie hasla") | WSB_separatedContent[0] == "Polskie hasla przedmiotowe"))
                    {
                        WSB_Slowa_kluczowe_j_pl      = WSB_separatedContent[1].Split(separators, StringSplitOptions.RemoveEmptyEntries);
                        WSB_Slowa_kluczowe_j_pl_line = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("angielskie hasla") | WSB_separatedContent[0].Contains("Angielskie hasla") | WSB_separatedContent[0] == "Angielskie hasla przedmiotowe"))
                    {
                        WSB_Slowa_kluczowe_j_ang      = WSB_separatedContent[1].Split(separators, StringSplitOptions.RemoveEmptyEntries);
                        WSB_Slowa_kluczowe_j_ang_line = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul calosci") | WSB_separatedContent[0].Contains("Tytul calosci") | WSB_separatedContent[0] == "Tytul calosci"))
                    {
                        WSB_Tytul_calosci = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("doi") | WSB_separatedContent[0].Contains("DOI") | WSB_separatedContent[0] == "DOI"))
                    {
                        WSB_DOI = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul pracy w innym") | WSB_separatedContent[0].Contains("Tytul pracy w innym") | WSB_separatedContent[0] == "Tytul pracy w innym jezyku"))
                    {
                        WSB_Tytul_pracy_w_innym_j = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("szczegoly") | WSB_separatedContent[0].Contains("Szczegoly") | WSB_separatedContent[0] == "Szczegoly"))
                    {
                        WSB_Szczegoly = WSB_separatedContent[1];
                    }
                    else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("url") | WSB_separatedContent[0].Contains("Url") | WSB_separatedContent[0] == "Adres url"))
                    {
                        WSB_URL = WSB_separatedContent[1];
                    }
                    p++;
                }

                #region Old_iteration_method

                /* -- 21.08.2018 Old wersion of iteration
                 * for (int i = 0; i <= hapDoc.DocumentNode.InnerText.Length; i++)
                 * {
                 *  WSB_line = sr.ReadLine();
                 *  if (WSB_line != null)
                 *  {
                 *      WSB_newcontent[i] = WSB_line;
                 *      WSB_separatedContent = WSB_line.Split(line_separator, 2);
                 *
                 *
                 *      if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("autor") | WSB_separatedContent[0].Contains("Autor") | WSB_separatedContent[0] == "Autorzy"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_autors = WSB_separatedContent[1].Split(autor_separators, StringSplitOptions.RemoveEmptyEntries);
                 *          WSB_author_line = WSB_separatedContent[1];
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].Contains("Liczba odnalezionych") | WSB_separatedContent[0] == "Liczba odnalezionych rekordow"))
                 *      {
                 *          WSB_articles_Count = Convert.ToInt32(WSB_separatedContent[1]);
                 *          WSB_articles_Matrix = new string[WSB_articles_Count];
                 *          for (int z = 0; z <= WSB_articles_Count - 1; z++)
                 *          {
                 *              WSB_articles_Matrix[z] = (z + 1) + ".";
                 *          }
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 1 & WSB_articles_Matrix.Any(x => WSB_separatedContent[0].Contains(x)))
                 *      {
                 *          if (WSB_author_line != null & WSB_Tytul_pracy != null)
                 *          {
                 *              using(var dbContext = new ArticleDBDataModelContainer())
                 *              {
                 *                  var document = new StringBuilder();
                 *                  var wsb_article = dbContext.WSB_ArticlesSet.Create();
                 *
                 *                  if (WSB_author_line == null)
                 *                  {
                 *                      WSB_author_line = "Not_defined";
                 *                  }
                 *                  wsb_article.article_authors = WSB_author_line;
                 *                  WSB_author_line = null;
                 *
                 *                  if (WSB_Tytul_pracy == null)
                 *                  {
                 *                      WSB_Tytul_pracy = "Not_defined";
                 *                  }
                 *                  wsb_article.article_title = WSB_Tytul_pracy;
                 *                  if (WSB_Tytul_pracy != String.Empty | WSB_Tytul_pracy != " " | WSB_Tytul_pracy != null)
                 *                  {
                 *                      var termTitle_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_pracy);
                 *                      document.Append(termTitle_WSB);
                 *                  }
                 *                  WSB_Tytul_pracy = null;
                 *
                 *                  if (WSB_Adres_wydawniczy == null)
                 *                  {
                 *                      WSB_Adres_wydawniczy = "Not_defined";
                 *                  }
                 *                  wsb_article.article_publisher_adres = WSB_Adres_wydawniczy;
                 *                  WSB_Adres_wydawniczy = null;
                 *
                 *                  if (WSB_Tytul_calosci == null)
                 *                  {
                 *                      WSB_Tytul_calosci = "Not_defined";
                 *                  }
                 *                  wsb_article.article_common_title = WSB_Tytul_calosci;
                 *                  if (WSB_Tytul_calosci != String.Empty | WSB_Tytul_calosci != " " | WSB_Tytul_calosci != null)
                 *                  {
                 *                      var termFullTitle_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_calosci);
                 *                      document.Append(termFullTitle_WSB);
                 *                  }
                 *                  WSB_Tytul_calosci = null;
                 *
                 *                  if (WSB_Slowa_kluczowe_j_pl_line == null)
                 *                  {
                 *                      WSB_Slowa_kluczowe_j_pl_line = "Not_defined";
                 *                  }
                 *                  wsb_article.article_pl_keywords = WSB_Slowa_kluczowe_j_pl_line;
                 *                  if (WSB_Slowa_kluczowe_j_pl_line != String.Empty | WSB_Slowa_kluczowe_j_pl_line != " " | WSB_Slowa_kluczowe_j_pl_line != null)
                 *                  {
                 *                      var term_PL_Keywords_WSB = TextPreparing.TermsPrepataions(WSB_Slowa_kluczowe_j_pl_line);
                 *                      document.Append(term_PL_Keywords_WSB);
                 *                  }
                 *                  WSB_Slowa_kluczowe_j_pl_line = null;
                 *
                 *                  if (WSB_Slowa_kluczowe_j_ang_line == null)
                 *                  {
                 *                      WSB_Slowa_kluczowe_j_ang_line = "Not_defined";
                 *                  }
                 *                  wsb_article.article_eng_keywords = WSB_Slowa_kluczowe_j_ang_line;
                 *                  if (WSB_Slowa_kluczowe_j_ang_line != String.Empty | WSB_Slowa_kluczowe_j_ang_line != " " | WSB_Slowa_kluczowe_j_ang_line != null)
                 *                  {
                 *                      var term_Eng_Keywords_WSB = TextPreparing.TermsPrepataions(WSB_Slowa_kluczowe_j_ang_line);
                 *                      document.Append(term_Eng_Keywords_WSB);
                 *                  }
                 *                  WSB_Slowa_kluczowe_j_ang_line = null;
                 *
                 *                  if (WSB_Tytul_pracy_w_innym_j == null)
                 *                  {
                 *                      WSB_Tytul_pracy_w_innym_j = "Not_defined";
                 *                  }
                 *                  wsb_article.article_title_other_lang = WSB_Tytul_pracy_w_innym_j;
                 *                  if (WSB_Tytul_pracy_w_innym_j != String.Empty | WSB_Tytul_pracy_w_innym_j != " " | WSB_Tytul_pracy_w_innym_j != null)
                 *                  {
                 *                      var term_Title_Other_Lang_WSB = TextPreparing.TermsPrepataions(WSB_Tytul_pracy_w_innym_j);
                 *                      document.Append(term_Title_Other_Lang_WSB);
                 *                  }
                 *                  WSB_Tytul_pracy_w_innym_j = null;
                 *
                 *                  if (WSB_Szczegoly == null)
                 *                  {
                 *                      WSB_Szczegoly = "Not_defined";
                 *                  }
                 *                  wsb_article.article_details = WSB_Szczegoly;
                 *                  WSB_Szczegoly = null;
                 *
                 *                  if (WSB_URL == null)
                 *                  {
                 *                      WSB_URL = "Not_defined";
                 *                  }
                 *                  wsb_article.article_URL = WSB_URL;
                 *                  WSB_URL = null;
                 *
                 *                  if (WSB_DOI == null)
                 *                  {
                 *                      WSB_DOI = "Not_defined";
                 *                  }
                 *                  wsb_article.article_DOI = WSB_DOI;
                 *                  WSB_DOI = null;
                 *
                 *
                 *                  for (int k = 0; k <= WSB_autors.Length - 2;)
                 *                  {
                 *                      var authors_of_the_article = dbContext.AuthorSet.Create();
                 *                      authors_of_the_article.author_name = WSB_autors[k];
                 *                      authors_of_the_article.author_surename = WSB_autors[k + 1];
                 *                      wsb_article.Author.Add(authors_of_the_article);
                 *                      k += 2;
                 *                  }
                 *
                 *                   dbContext.WSB_ArticlesSet.Add(wsb_article);
                 *
                 *                  var _document = document.ToString().Split(' ', ';', ':', ',');
                 *                  for (int k = 0; k <= _document.Length - 1; k++)
                 *                  {
                 *                      var terms = dbContext.Terms_Vocabulary.Create();
                 *                      //
                 *                      string dictionary_text = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv");
                 *                      string[] allowed_dictionary = dictionary_text.Split(',', '\n');
                 *
                 *                      for (int p = 0; p <= _document.Length - 1; p++)
                 *                      {
                 *                          for (int j = 0; j <= allowed_dictionary.Length - 1; j++)
                 *                          {
                 *                              if (_document[p].Length > 3 & _document[p].Contains(allowed_dictionary[j]))
                 *                              {
                 *                                  continue;
                 *                              }
                 *                              else if (_document[p].Length <= 3 & !(_document[p].Contains(allowed_dictionary[j])))
                 *                              {
                 *                                  _document.ToList().RemoveAt(p);
                 *                              }
                 *
                 *                          }
                 *                      }
                 *
                 *                      //tutaj potrzebnie przepisac id dokumenta w ktorym wystepuje dane slowo
                 *                      if (_document[k] != String.Empty | _document[k] != " " | _document[k] != null | _document[k] != Char.IsDigit(' ').ToString())
                 *                      {
                 *                          //dbContext.Terms_Vocabulary.Where(u)
                 *                          var termVocabularyTable = dbContext.Terms_Vocabulary;
                 *                          terms.term_value = _document[k];
                 *
                 *                      }
                 *                      wsb_article.Terms_Vocabulary.Add(terms);
                 *                  }
                 *                  try
                 *                  {
                 *                      dbContext.SaveChanges();
                 *                  }
                 *                  catch (Exception ex)
                 *                  {
                 *                      File.WriteAllText(@"F:\\Magistry files\WSB_crawler_Log.txt", ex.ToString());
                 *                  }
                 *
                 *              }
                 *          }
                 *
                 *          else
                 *          {
                 *              //return;
                 *              //System.Windows.MessageBox.Show("brak danych!");
                 *              //File.WriteAllText(@"F:\\Magistry files\WSB_emptyLines.txt", "empty_line");
                 *              continue;
                 *          }
                 *
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul pracy") | WSB_separatedContent[0].Contains("Tytul pracy") | WSB_separatedContent[0] == "Tytul pracy"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_Tytul_pracy = WSB_separatedContent[1];
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("adres wydawniczy") | WSB_separatedContent[0].Contains("Adres wydawniczy") | WSB_separatedContent[0] == "Adres wydawniczy"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_Adres_wydawniczy = WSB_separatedContent[1];
                 *      }
                 *
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("polskie hasla") | WSB_separatedContent[0].Contains("Polskie hasla") | WSB_separatedContent[0] == "Polskie hasla przedmiotowe"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_Slowa_kluczowe_j_pl = WSB_separatedContent[1].Split(separators, StringSplitOptions.RemoveEmptyEntries);
                 *          WSB_Slowa_kluczowe_j_pl_line = WSB_separatedContent[1];
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("angielskie hasla") | WSB_separatedContent[0].Contains("Angielskie hasla") | WSB_separatedContent[0] == "Angielskie hasla przedmiotowe"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_Slowa_kluczowe_j_ang = WSB_separatedContent[1].Split(separators, StringSplitOptions.RemoveEmptyEntries);
                 *          WSB_Slowa_kluczowe_j_ang_line = WSB_separatedContent[1];
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul calosci") | WSB_separatedContent[0].Contains("Tytul calosci") | WSB_separatedContent[0] == "Tytul calosci"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_Tytul_calosci = WSB_separatedContent[1];
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("doi") | WSB_separatedContent[0].Contains("DOI") | WSB_separatedContent[0] == "DOI"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_DOI = WSB_separatedContent[1];
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("tytul pracy w innym") | WSB_separatedContent[0].Contains("Tytul pracy w innym") | WSB_separatedContent[0] == "Tytul pracy w innym jezyku"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_Tytul_pracy_w_innym_j = WSB_separatedContent[1];
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("szczegoly") | WSB_separatedContent[0].Contains("Szczegoly") | WSB_separatedContent[0] == "Szczegoly"))
                 *      {
                 *         //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_Szczegoly = WSB_separatedContent[1];
                 *      }
                 *
                 *      else if (WSB_separatedContent.Length == 2 & (WSB_separatedContent[0].ToLower().Contains("url") | WSB_separatedContent[0].Contains("Url") | WSB_separatedContent[0] == "Adres url"))
                 *      {
                 *          //System.Windows.MessageBox.Show(WSB_separatedContent[1]);
                 *          WSB_URL = WSB_separatedContent[1];
                 *      }
                 *
                 *      //else if (PP_separatedContent.Length == 1 & PP_separatedContent[0] == String.Empty) System.Windows.MessageBox.Show("The empty line detected", "Empty line", System.Windows.MessageBoxButton.OK);
                 *      //else System.Windows.MessageBox.Show("Error! Content not found!", "Error!", System.Windows.MessageBoxButton.OK);
                 *  }
                 * }
                 */
                #endregion
            }
        }
Exemplo n.º 13
0
        /// <summary>
        /// Reads every row of the PG, PP, UG, UMK and WSB article tables and appends them
        /// to a CSV file and to a JavaScript file that declares an <c>articles</c> array.
        /// </summary>
        /// <param name="articlesCSV">Path of the CSV file the header and rows are appended to.</param>
        /// <param name="articlesJson">Path of the file the <c>var articles = [...]</c> text is appended to.</param>
        public static void GenerateArticlesToCSVandJsonFromDB(string articlesCSV, string articlesJson)
        {
            // StringBuilder / List instead of repeated string "+=" inside the loops,
            // which re-copied the whole accumulated text for every article.
            var csvContent = new StringBuilder();
            var jsonItems  = new List<string>();

            csvContent.Append(articleHeaderCSV);

            // ToList() never returns null, so no null checks are needed on the query results.
            using (var PG_dbcontext = new ArticleDBDataModelContainer())
            {
                foreach (var item in PG_dbcontext.PG_ArticlesSet.SqlQuery("SELECT * FROM dbo.PG_ArticlesSet").ToList())
                {
                    ArticlesJsonObj articlesJsonObj = new ArticlesJsonObj(item.article_Id, item.title, item.abstractText, item.keywords, item.year.ToString(), item.authors, item.url);
                    AppendArticleCsvRow(csvContent, item.article_Id, item.title, item.abstractText, item.keywords, item.year, item.authors, item.url);
                    jsonItems.Add(JsonConvert.SerializeObject(articlesJsonObj));
                }
            }

            using (var PP_dbcontext = new ArticleDBDataModelContainer())
            {
                foreach (var item in PP_dbcontext.PP_ArticlesSet.SqlQuery("SELECT * FROM dbo.PP_ArticlesSet").ToList())
                {
                    // PP rows carry no abstract and no keywords.
                    ArticlesJsonObj articlesJsonObj = new ArticlesJsonObj(item.article_Id, item.article_title, string.Empty, string.Empty, item.article_year.ToString(), item.article_author_line, item.article_DOI);
                    AppendArticleCsvRow(csvContent, item.article_Id, item.article_title, "", "", item.article_year, item.article_author_line, item.article_DOI);
                    jsonItems.Add(JsonConvert.SerializeObject(articlesJsonObj));
                }
            }

            using (var UG_dbcontext = new ArticleDBDataModelContainer())
            {
                foreach (var item in UG_dbcontext.UG_ArticlesSet.SqlQuery("SELECT * FROM dbo.UG_ArticlesSet").ToList())
                {
                    // UG rows carry no abstract and no year.
                    ArticlesJsonObj articlesJsonObj = new ArticlesJsonObj(item.article_Id, item.article_title, string.Empty, item.article_keywords, string.Empty, item.article_author_line, item.article_DOI);
                    AppendArticleCsvRow(csvContent, item.article_Id, item.article_title, "", item.article_keywords, "", item.article_author_line, item.article_DOI);
                    jsonItems.Add(JsonConvert.SerializeObject(articlesJsonObj));
                }
            }

            using (var UMK_dbcontext = new ArticleDBDataModelContainer())
            {
                foreach (var item in UMK_dbcontext.UMK_ArticlesSet.SqlQuery("SELECT * FROM dbo.UMK_ArticlesSet").ToList())
                {
                    // The three title variants and both keyword lists are merged into single fields.
                    ArticlesJsonObj articlesJsonObj = new ArticlesJsonObj(item.article_Id, item.article_title + " " + item.article_Full_title + " " + item.article_translated_title, string.Empty, item.article_pl_keywords + " " + item.article_eng_keywords, string.Empty, item.article_author_line, item.article_url);
                    AppendArticleCsvRow(csvContent, item.article_Id, item.article_title + " " + item.article_Full_title + " " + item.article_translated_title, "", item.article_pl_keywords + " " + item.article_eng_keywords, "", item.article_author_line, item.article_url);
                    jsonItems.Add(JsonConvert.SerializeObject(articlesJsonObj));
                }
            }

            using (var WSB_dbcontext = new ArticleDBDataModelContainer())
            {
                foreach (var item in WSB_dbcontext.WSB_ArticlesSet.SqlQuery("SELECT * FROM dbo.WSB_ArticlesSet").ToList())
                {
                    // WSB rows keep a single space (not an empty string) in the unused CSV columns.
                    ArticlesJsonObj articlesJsonObj = new ArticlesJsonObj(item.article_Id, item.article_title + " " + item.article_common_title + " " + item.article_title_other_lang, string.Empty, item.article_pl_keywords + " " + item.article_eng_keywords, string.Empty, item.article_authors, item.article_URL);
                    AppendArticleCsvRow(csvContent, item.article_Id, item.article_title + " " + item.article_common_title + " " + item.article_title_other_lang, " ", item.article_pl_keywords + " " + item.article_eng_keywords, " ", item.article_authors, item.article_URL);
                    jsonItems.Add(JsonConvert.SerializeObject(articlesJsonObj));
                }
            }

            // Array elements must be comma-separated; the previous newline-only separator
            // produced a script that failed to parse as soon as there were two articles.
            var jsonContent = new StringBuilder("var articles = [");
            jsonContent.Append(string.Join(",\n", jsonItems));
            if (jsonItems.Count > 0)
            {
                jsonContent.Append('\n');
            }
            jsonContent.Append(']');

            // NOTE(review): both files are opened in append mode, so running the export twice
            // against the same paths repeats the CSV header and adds a second "var articles"
            // declaration. Kept as-is for compatibility - confirm whether overwrite was intended.
            using (StreamWriter csv_SW = File.AppendText(articlesCSV))
            {
                csv_SW.Write(csvContent.ToString());
            }
            using (StreamWriter json_SW = File.AppendText(articlesJson))
            {
                json_SW.Write(jsonContent.ToString());
            }
        }

        /// <summary>
        /// Appends one CSV record to <paramref name="target"/>: every field is wrapped in double
        /// quotes, fields are separated by commas and the record ends with '\n'.
        /// </summary>
        /// <param name="target">Buffer receiving the record.</param>
        /// <param name="fields">Field values in column order; <c>null</c> is written as an empty field.</param>
        private static void AppendArticleCsvRow(StringBuilder target, params object[] fields)
        {
            for (int i = 0; i < fields.Length; i++)
            {
                if (i > 0)
                {
                    target.Append(',');
                }

                string text = fields[i] == null ? null : fields[i].ToString();

                // Embedded quotes are doubled so a title such as: An "improved" method
                // does not terminate the quoted field early and shift the remaining columns.
                target.Append('"')
                      .Append((text ?? string.Empty).Replace("\"", "\"\""))
                      .Append('"');
            }
            target.Append('\n');
        }
Exemplo n.º 14
0
 /// <summary>
 /// Returns the <c>Terms_Vocabulary</c> set exposed by the supplied data context.
 /// </summary>
 /// <param name="dbcon">Context whose <c>Terms_Vocabulary</c> property is read.</param>
 /// <returns>The value of <c>dbcon.Terms_Vocabulary</c>.</returns>
 private static DbSet <Terms_Vocabulary> GetTerms_Vocabulary(ArticleDBDataModelContainer dbcon)
 {
     DbSet<Terms_Vocabulary> vocabulary = dbcon.Terms_Vocabulary;
     return vocabulary;
 }
Exemplo n.º 15
0
        public static void get_UG_Document_content()
        {
            string[] UG_newcontent       = new string[hapDoc.DocumentNode.InnerText.Length];
            string[] UG_separatedContent = new string[hapDoc.DocumentNode.InnerText.Length];

            UG_articles_Count = 0;
            string[] UG_articles_Matrix = { String.Empty };

            using (StringReader sr = new StringReader(endText))
            {
                int    p = 0;
                string UG_line;

                while ((UG_line = sr.ReadLine()) != null)
                {
                    UG_newcontent[p]    = UG_line;
                    UG_separatedContent = UG_line.Split(line_separator, 2);

                    if (UG_separatedContent.Length == 1 & UG_separatedContent[0] == "")
                    {
                        continue;
                    }
                    else if (UG_separatedContent.Length == 1 && UG_articles_Matrix.Any(x => UG_separatedContent[0].Contains(x)))
                    {
                        if (UG_author_line != null && UG_Tytul != null)
                        {
                            using (var dbContext = new ArticleDBDataModelContainer())
                            {
                                var document   = new StringBuilder();
                                var ug_article = dbContext.UG_ArticlesSet.Create();

                                ug_article.article_author_line = UG_author_line;
                                UG_author_line = null;

                                ug_article.article_keywords = UG_slowa_kluczowe_j_ang_line;
                                if (UG_slowa_kluczowe_j_ang_line != String.Empty || UG_slowa_kluczowe_j_ang_line != " " || UG_slowa_kluczowe_j_ang_line != null)
                                {
                                    var termEngKeywords = TextPreparing.TermsPrepataions(UG_slowa_kluczowe_j_ang_line);
                                    document.Append(termEngKeywords);
                                }
                                UG_slowa_kluczowe_j_ang_line = null;

                                ug_article.article_source = UG_Zrodlo;
                                UG_Zrodlo = null;

                                ug_article.article_title = UG_Tytul;
                                if (UG_Tytul != String.Empty || UG_Tytul != " " || UG_Tytul != null)
                                {
                                    var term_UG_Title = TextPreparing.TermsPrepataions(UG_Tytul);
                                    document.Append(term_UG_Title);
                                }
                                UG_Tytul = null;

                                ug_article.article_DOI = UG_DOI;
                                UG_DOI = null;

                                for (int k = 0; k <= UG_autors.Length - 2;)
                                {
                                    var authors_of_the_article = dbContext.AuthorSet.Create();
                                    authors_of_the_article.author_name     = UG_autors[k];
                                    authors_of_the_article.author_surename = UG_autors[k + 1];
                                    ug_article.Author.Add(authors_of_the_article);
                                    k += 2;
                                }

                                dbContext.UG_ArticlesSet.Add(ug_article);

                                var _document = document.ToString().Split(' ', ';', ':', ',');
                                for (int k = 0; k <= _document.Length - 1; k++)
                                {
                                    var      terms              = dbContext.Terms_Vocabulary.Create();
                                    string   dictionary_text    = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv");
                                    string[] allowed_dictionary = dictionary_text.Split(',', '\n');

                                    for (int d = 0; d <= _document.Length - 1; d++)
                                    {
                                        for (int j = 0; j <= allowed_dictionary.Length - 1; j++)
                                        {
                                            if (_document[d].Length > 3 && _document[d].Contains(allowed_dictionary[j]))
                                            {
                                                continue;
                                            }
                                            else if (_document[d].Length <= 3 && !(_document[d].Contains(allowed_dictionary[j])))
                                            {
                                                _document.ToList().RemoveAt(d);
                                            }
                                        }
                                    }

                                    //tutaj potrzebnie przepisac id dokumenta w ktorym wystepuje dane slowo
                                    if (_document[k] != String.Empty || _document[k] != " " || _document[k] != null || _document[k] != Char.IsDigit(' ').ToString() || dbContext.Terms_Vocabulary.Any(o => o.term_value != _document[k]))
                                    {
                                        //dbContext.Terms_Vocabulary.Where(u)
                                        var termVocabularyTable = dbContext.Terms_Vocabulary;
                                        terms.term_value = _document[k];
                                    }
                                    try
                                    {
                                        ug_article.Terms_Vocabulary.Add(terms);
                                    }
                                    catch (Exception addingTermToDB)
                                    {
                                        File.WriteAllText(@"F:\\Magistry files\UG_crawler_Log.txt", DateTime.Now.ToString() + addingTermToDB.ToString());
                                    }
                                }
                                try
                                {
                                    dbContext.SaveChanges();
                                }
                                catch (Exception ex)
                                {
                                    File.WriteAllText(@"F:\\Magistry files\UG_crawler_Log.txt", DateTime.Now.ToString() + ex.ToString());
                                }
                            }
                        }
                        else
                        {
                            File.WriteAllText(@"F:\\Magistry files\UG_crawler_Log.txt", "Empty line detected." + '\n');
                        }
                    }
                    else if (UG_separatedContent.Length == 2 && (UG_separatedContent[0].Contains("Liczba odnalezionych") || UG_separatedContent[0] == "Liczba odnalezionych rekordow"))
                    {
                        UG_articles_Count  = Convert.ToInt32(UG_separatedContent[1]);
                        UG_articles_Matrix = new string[UG_articles_Count];
                        for (int z = 0; z <= UG_articles_Count - 1; z++)
                        {
                            UG_articles_Matrix[z] = (z + 1) + ".";
                        }
                    }
                    else if (UG_separatedContent.Length == 2 && UG_separatedContent[0].ToLower().Contains("autorzy"))
                    {
                        UG_author_line = UG_separatedContent[1];
                        UG_autors      = UG_separatedContent[1].Split(autor_separators, StringSplitOptions.RemoveEmptyEntries);
                    }
                    else if (UG_separatedContent.Length == 2 && (UG_separatedContent[0].ToLower().Contains("tytu") || UG_separatedContent[0].ToLower().Contains("tytul") || UG_separatedContent[0].Contains("TYTUL") || UG_separatedContent[0] == "TYTUL[ROZDZIALU, FRAGMENTU]" || UG_separatedContent[0].Contains("TYTUL[ROZDZIALU, FRAGMENTU]") || UG_separatedContent[0].ToLower().Contains("TYTUL[ROZDZIALU, FRAGMENTU]")))
                    {
                        UG_Tytul = UG_separatedContent[1];
                    }
                    else if (UG_separatedContent.Length == 2 && UG_separatedContent[0].ToLower().Contains("zrodlo"))
                    {
                        UG_Zrodlo = UG_separatedContent[1];
                    }
                    else if (UG_separatedContent.Length == 2 && UG_separatedContent[0].Contains("Slowa kluczowe w j. ang."))
                    {
                        UG_Slowa_kluczowe_j_ang      = UG_separatedContent[1].Split(separators);
                        UG_slowa_kluczowe_j_ang_line = UG_separatedContent[1];
                    }
                    else if (UG_separatedContent.Length == 2 && (UG_separatedContent[0] == "DOI" || UG_separatedContent.Contains("DOI") || UG_separatedContent[0].ToLower().Contains("doi")))
                    {
                        UG_DOI = UG_separatedContent[1];
                    }
                    p++;
                }
                #region Old_reader_code
                // 21.08.2018 - Old version of code

                /*
                 * for (int i = 0; i <= hapDoc.DocumentNode.InnerText.Length; i++)
                 * {
                 *
                 *  UG_line = sr.ReadLine();
                 *  if (UG_line != null)
                 *  {
                 *      UG_newcontent[i] = UG_line;
                 *      UG_separatedContent = UG_line.Split(line_separator, 2);
                 *
                 *      if (UG_separatedContent.Length == 2 && UG_separatedContent[0].ToLower().Contains("autorzy"))
                 *      {
                 *          UG_author_line = UG_separatedContent[1];
                 *          UG_autors = UG_separatedContent[1].Split(autor_separators, StringSplitOptions.RemoveEmptyEntries);
                 *
                 *      }
                 *      else if (UG_separatedContent.Length == 2 && (UG_separatedContent[0].Contains("Liczba odnalezionych") || UG_separatedContent[0] == "Liczba odnalezionych rekordow"))
                 *      {
                 *          UG_articles_Count = Convert.ToInt32(UG_separatedContent[1]);
                 *          UG_articles_Matrix = new string[UG_articles_Count];
                 *          for (int z = 0; z <= UG_articles_Count - 1; z++)
                 *          {
                 *              UG_articles_Matrix[z] = (z + 1) + ".";
                 *          }
                 *      }
                 *      else if (UG_separatedContent.Length == 1 && UG_articles_Matrix.Any(x => UG_separatedContent[0].Contains(x)))
                 *      {
                 *          if (UG_author_line != null && UG_Tytul != null)
                 *          {
                 *              using(var dbContext = new ArticleDBDataModelContainer())
                 *              {
                 *                  var document = new StringBuilder();
                 *                  var ug_article = dbContext.UG_ArticlesSet.Create();
                 *
                 *                  ug_article.article_author_line = UG_author_line;
                 *                  UG_author_line = null;
                 *
                 *                  ug_article.article_keywords = UG_slowa_kluczowe_j_ang_line;
                 *                  if (UG_slowa_kluczowe_j_ang_line != String.Empty || UG_slowa_kluczowe_j_ang_line != " " || UG_slowa_kluczowe_j_ang_line != null)
                 *                  {
                 *                      var termEngKeywords = TextPreparing.TermsPrepataions(UG_slowa_kluczowe_j_ang_line);
                 *                      document.Append(termEngKeywords);
                 *                  }
                 *                  UG_slowa_kluczowe_j_ang_line = null;
                 *
                 *                  ug_article.article_source = UG_Zrodlo;
                 *                  UG_Zrodlo = null;
                 *
                 *                  ug_article.article_title = UG_Tytul;
                 *                  if (UG_Tytul != String.Empty || UG_Tytul != " " || UG_Tytul != null)
                 *                  {
                 *                      var term_UG_Title = TextPreparing.TermsPrepataions(UG_Tytul);
                 *                      document.Append(term_UG_Title);
                 *                  }
                 *                  UG_Tytul = null;
                 *
                 *                  ug_article.article_DOI = UG_DOI;
                 *                  UG_DOI = null;
                 *
                 *                  for (int k = 0; k <= UG_autors.Length - 2;)
                 *                  {
                 *                      var authors_of_the_article = dbContext.AuthorSet.Create();
                 *                      authors_of_the_article.author_name = UG_autors[k];
                 *                      authors_of_the_article.author_surename = UG_autors[k + 1];
                 *                      ug_article.Author.Add(authors_of_the_article);
                 *                      k += 2;
                 *                  }
                 *                  dbContext.UG_ArticlesSet.Add(ug_article);
                 *                  var _document = document.ToString().Split(' ', ';', ':', ',');
                 *                  for (int k = 0; k <= _document.Length - 1; k++)
                 *                  {
                 *                      var terms = dbContext.Terms_Vocabulary.Create();
                 *                      string dictionary_text = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv");
                 *                      string[] allowed_dictionary = dictionary_text.Split(',', '\n');
                 *
                 *                      for (int d = 0; d <= _document.Length - 1; d++)
                 *                      {
                 *                          for (int j = 0; j <= allowed_dictionary.Length - 1; j++)
                 *                              if (_document[d].Length > 3 && _document[d].Contains(allowed_dictionary[j]))
                 *                                  continue;
                 *                              else if (_document[d].Length <= 3 && !(_document[d].Contains(allowed_dictionary[j])))
                 *                                  _document.ToList().RemoveAt(d);
                 *                      }
                 *
                 *                      //tutaj potrzebnie przepisac id dokumenta w ktorym wystepuje dane slowo
                 *                      if (_document[k] != String.Empty || _document[k] != " " || _document[k] != null || _document[k] != Char.IsDigit(' ').ToString())
                 *                      {
                 *                          //dbContext.Terms_Vocabulary.Where(u)
                 *                          var termVocabularyTable = dbContext.Terms_Vocabulary;
                 *                          terms.term_value = _document[k];
                 *
                 *                      }
                 *                      ug_article.Terms_Vocabulary.Add(terms);
                 *                  }
                 *                  try
                 *                  {
                 *                      dbContext.SaveChanges();
                 *                  }
                 *                  catch(Exception ex)
                 *                  {
                 *                      File.WriteAllText(@"F:\\Magistry files\UG_crawler_Log.txt", ex.ToString());
                 *                  }
                 *              }
                 *
                 *              ///<summary>
                 *              /// UGArticle_Entity_Object_Creation
                 *              /// </summary>
                 #region UGArticle_Entity_Object_Creation
                 *              using (var db = new PublicationsContext())
                 *              {
                 *                  var ug_article = new UGArticle();
                 *                  ug_article.article_author_line = UG_author_line;
                 *                  UG_author_line = null;
                 *                  ug_article.article_keywords = UG_slowa_kluczowe_j_ang_line;
                 *                  UG_slowa_kluczowe_j_ang_line = null;
                 *                  ug_article.article_source = UG_Zrodlo;
                 *                  UG_Zrodlo = null;
                 *                  ug_article.article_title = UG_Tytul;
                 *                  UG_Tytul = null;
                 *                  ug_article.article_DOI = UG_DOI;
                 *                  UG_DOI = null;
                 *
                 *                  var authors_of_the_article = new Authors();
                 *                  for (int k = 0; k <= UG_autors.Length - 2; k++)
                 *                  {
                 *                      authors_of_the_article.author_name = UG_autors[k];
                 *                      authors_of_the_article.author_surename = UG_autors[k + 1];
                 *                      authors_of_the_article.article_Id = ug_article.article_Id;
                 *
                 *                      db.Authors.Add(authors_of_the_article);
                 *                  }
                 *
                 *                  //authors_of_the_article.UG_Articles.Add(ug_article);
                 *                  db.UG_Articles.Add(ug_article);
                 *                  db.SaveChanges();
                 *              }
                 *          else
                 *          {
                 *              File.WriteAllText(@"F:\\Magistry files\UG_crawler_Log.txt", "Empty line detected."+'\n');
                 *          }
                 *      }
                 *      else if (UG_separatedContent.Length == 2 && (UG_separatedContent[0].ToLower().Contains("tytu") || UG_separatedContent[0].ToLower().Contains("tytul") || UG_separatedContent[0].Contains("TYTUL") || UG_separatedContent[0]=="TYTUL[ROZDZIALU, FRAGMENTU]" || UG_separatedContent[0].Contains("TYTUL[ROZDZIALU, FRAGMENTU]") || UG_separatedContent[0].ToLower().Contains("TYTUL[ROZDZIALU, FRAGMENTU]")))
                 *      {
                 *          UG_Tytul = UG_separatedContent[1];
                 *      }
                 *      else if (UG_separatedContent.Length == 2 && UG_separatedContent[0].ToLower().Contains("zrodlo")){
                 *          UG_Zrodlo = UG_separatedContent[1];
                 *      }
                 *      else if (UG_separatedContent.Length == 2 && UG_separatedContent[0].Contains("Slowa kluczowe w j. ang."))
                 *      {
                 *          UG_Slowa_kluczowe_j_ang = UG_separatedContent[1].Split(separators);
                 *          UG_slowa_kluczowe_j_ang_line = UG_separatedContent[1];
                 *      }
                 *      else if (UG_separatedContent.Length == 2 && (UG_separatedContent[0] == "DOI" || UG_separatedContent.Contains("DOI") || UG_separatedContent[0].ToLower().Contains("doi")))
                 *      {
                 *          UG_DOI = UG_separatedContent[1];
                 *      }
                 *  }
                 * }
                 */
                #endregion
            }
        }
Exemplo n.º 16
0
        /// <summary>
        /// Builds the document collection used for text processing: one lower-cased
        /// string per article, read from the PG, PP, UG, UMK and WSB article tables.
        /// </summary>
        /// <returns>
        /// A list with one entry per article that has at least one non-empty text
        /// field. Each entry is the concatenation of the article's text fields, in a
        /// fixed per-table order, converted to lower case.
        /// </returns>
        public static List <string> GenerateCollection()
        {
            List <string> DocumentCollection = new List <string>();

            using (var dbContext = new ArticleDBDataModelContainer())
            {
                // ToList() never returns null, so the query results can be iterated directly.
                foreach (var item in dbContext.PG_ArticlesSet.SqlQuery("SELECT * FROM dbo.PG_ArticlesSet").ToList())
                {
                    AddLoweredDocument(DocumentCollection, item.title, item.abstractText, item.keywords);
                }

                foreach (var PP_item in dbContext.PP_ArticlesSet.SqlQuery("SELECT * FROM dbo.PP_ArticlesSet").ToList())
                {
                    AddLoweredDocument(DocumentCollection, PP_item.article_title, PP_item.article_source);
                }

                foreach (var UG_item in dbContext.UG_ArticlesSet.SqlQuery("SELECT * FROM UG_ArticlesSet").ToList())
                {
                    AddLoweredDocument(DocumentCollection, UG_item.article_title, UG_item.article_keywords);
                }

                foreach (var UMK_item in dbContext.UMK_ArticlesSet.SqlQuery("SELECT * FROM UMK_ArticlesSet").ToList())
                {
                    AddLoweredDocument(DocumentCollection,
                                       UMK_item.article_title,
                                       UMK_item.article_Full_title,
                                       UMK_item.article_translated_title,
                                       UMK_item.article_publisher_title,
                                       UMK_item.article_eng_keywords,
                                       UMK_item.article_pl_keywords);
                }

                foreach (var WSB_item in dbContext.WSB_ArticlesSet.SqlQuery("SELECT * FROM WSB_ArticlesSet").ToList())
                {
                    AddLoweredDocument(DocumentCollection,
                                       WSB_item.article_title,
                                       WSB_item.article_common_title,
                                       WSB_item.article_title_other_lang,
                                       WSB_item.article_pl_keywords,
                                       WSB_item.article_eng_keywords);
                }
            }

            return(DocumentCollection);
        }

        /// <summary>
        /// Concatenates the given text fields (a null field counts as empty), lower-cases
        /// the result and appends it to <paramref name="collection"/> unless every field
        /// was null or empty.
        /// </summary>
        /// <param name="collection">List that receives the document text.</param>
        /// <param name="fields">Text fields of one article, in concatenation order.</param>
        private static void AddLoweredDocument(List <string> collection, params string[] fields)
        {
            // string.Concat treats null elements as empty strings, so a missing column
            // can no longer cause a NullReferenceException.
            // NOTE(review): fields are joined without a separator, exactly as before;
            // confirm that adjacent fields are not expected to be space-delimited.
            string text = string.Concat(fields);

            if (text.Length > 0)
            {
                collection.Add(text.ToLower());
            }
        }
Exemplo n.º 17
0
        /// <summary>
        /// Builds a TF-IDF vector for every document in
        /// <paramref name="docCollectionDictionary"/> over the terms stored in the
        /// Terms_Vocabulary table, processing documents in parallel.
        /// </summary>
        /// <param name="docCollectionDictionary">Documents to vectorise: article id mapped to document text.</param>
        /// <returns>
        /// One <c>DocumentVector</c> per dictionary entry, in the enumeration order of the
        /// dictionary keys; <c>index_Of_Doc_for_labeling</c> is the entry's position in that order.
        /// </returns>
        internal static List <DocumentVector> DocumentCollectionProcessingDictionary(Dictionary <int, string> docCollectionDictionary)
        {
            parallelOption.MaxDegreeOfParallelism = 20;
            var vector_space_model_calculation = Stopwatch.StartNew();

            termHashset = new HashSet <string>();

            using (var dbContext = new ArticleDBDataModelContainer())
            {
                dbContext.Terms_Vocabulary.Load();

                foreach (var terms in dbContext.Terms_Vocabulary.Local)
                {
                    // A row without a value cannot contribute a term and would throw on ToLower().
                    if (terms.term_value != null)
                    {
                        termHashset.Add(terms.term_value.ToLower());
                    }
                }
            }

            // Snapshot the terms once so every worker sees the same term order.
            string[] termArray   = termHashset.ToArray();
            int[]    arrayOfDocs = docCollectionDictionary.Keys.ToArray();

            // Each worker writes only to its own slot, so no locking is required and the
            // result order is deterministic. Adding to a shared List<T> from several
            // threads is not safe.
            DocumentVector[] vectors = new DocumentVector[arrayOfDocs.Length];

            Parallel.For(0, arrayOfDocs.Length, parallelOption, position => {
                // All per-document state is local to the lambda; sharing these variables
                // between workers would let one document overwrite another's vector.
                int    articleId = arrayOfDocs[position];
                string content   = docCollectionDictionary[articleId];

                // A fresh list per document, because how FindTFIDF treats its corpus
                // argument is not visible from here.
                var     collectionValue = docCollectionDictionary.Values.ToList();
                float[] space           = new float[termArray.Length];

                for (int t = 0; t < termArray.Length; t++)
                {
                    space[t] = Logic.ClusteringAlgorithms.Used_functions.TFIDF2ndrealization.FindTFIDF(collectionValue, content, termArray[t]);
                }

                DocumentVector documentVector = new DocumentVector();

                documentVector.ArticleID = articleId;
                documentVector.index_Of_Doc_for_labeling = position;
                documentVector.Content     = content;
                documentVector.VectorSpace = space;

                vectors[position] = documentVector;
            });
            vector_space_model_calculation.Stop();

            string processing_log = @"F:\Magistry files\Processing_log.txt";

            using (StreamWriter sw = File.AppendText(processing_log))
            {
                sw.WriteLine(DateTime.Now.ToString() + " The vector space model calculation time is: " + vector_space_model_calculation.Elapsed.Minutes.ToString() + ":" + vector_space_model_calculation.Elapsed.TotalMilliseconds.ToString());
            }

            return(new List <DocumentVector>(vectors));
        }
        /// <summary>
        /// Loads the term vocabulary from the database and returns a cleaned list of terms.
        /// </summary>
        /// <remarks>
        /// Processing steps, in order:
        /// 1. read every non-empty <c>term_value</c> and lower-case it;
        /// 2. strip the characters listed in <c>not_allowedChars</c>;
        /// 3. drop terms that still contain a square bracket;
        /// 4. drop terms of three characters or fewer unless they contain an entry of the
        ///    allowed-term dictionary file;
        /// 5. drop duplicates and terms that are a substring of another remaining term.
        /// </remarks>
        /// <returns>The cleaned terms, in the order they were first read.</returns>
        public static List <string> GenerateTermCollection()
        {
            // NOTE(review): the letter 'x' is in this list and is therefore stripped from
            // every term; confirm that this is intended.
            char[] not_allowedChars = { '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '<', '>', 'x', '!', '#', '$', '%', '^', '&', '*', '(', ')', '/', '\'' };

            // Parentheses and asterisks are already removed by the stripping step, so
            // only square brackets can still disqualify a term afterwards.
            char[] bracketChars = { '[', ']' };

            List <string> loadedTerms = new List <string>();

            using (var dbContext = new ArticleDBDataModelContainer())
            {
                foreach (var item in dbContext.Terms_Vocabulary.SqlQuery("SELECT * FROM dbo.Terms_Vocabulary").ToList())
                {
                    if (!string.IsNullOrEmpty(item.term_value))
                    {
                        loadedTerms.Add(item.term_value.ToLower());
                    }
                }
            }

            string dictionary_text = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv");

            // Trim each entry (a file with Windows line endings leaves '\r' behind) and
            // skip empty ones: an empty entry is "contained" in every string and would
            // whitelist every short term.
            List <string> allowed_dictionary = new List <string>();
            foreach (string entry in dictionary_text.Split(',', '\n'))
            {
                string trimmedEntry = entry.Trim();
                if (trimmedEntry.Length > 0)
                {
                    allowed_dictionary.Add(trimmedEntry);
                }
            }

            // Build a new list instead of removing from the one being indexed; removing
            // while iterating by index skips elements and overruns the list.
            List <string>    candidates = new List <string>();
            HashSet <string> seen       = new HashSet <string>();

            foreach (string loaded in loadedTerms)
            {
                // Strings are immutable, so the cleaned value must be captured.
                string term = new string(loaded.Where(c => !not_allowedChars.Contains(c)).ToArray());

                if (term.IndexOfAny(bracketChars) >= 0)
                {
                    continue;
                }

                if (term.Length <= 3 && !allowed_dictionary.Any(allowed => term.Contains(allowed)))
                {
                    continue;
                }

                // HashSet.Add returns false for a value that is already present.
                if (seen.Add(term))
                {
                    candidates.Add(term);
                }
            }

            // Candidates are distinct, so a different term that contains this one is
            // necessarily longer; a term is never compared with itself.
            // NOTE(review): dropping substrings of longer terms follows the containment
            // test in the previous implementation; confirm this pruning is wanted.
            List <string> TermCollection = new List <string>();

            foreach (string candidate in candidates)
            {
                bool coveredByLongerTerm = candidates.Any(other => other.Length > candidate.Length && other.Contains(candidate));

                if (!coveredByLongerTerm)
                {
                    TermCollection.Add(candidate);
                }
            }

            return(TermCollection);
        }
Exemplo n.º 19
0
        /// <summary>
        /// Turns free text into a space-separated string of candidate vocabulary terms.
        /// </summary>
        /// <remarks>
        /// The text is lower-cased and split on space, comma, period, semicolon, hyphen
        /// and colon. A word is kept only when it is at least three characters long,
        /// contains none of the disallowed characters, is not already stored in the
        /// Terms_Vocabulary table and is not one of the stop words. Each kept word
        /// appears once, in order of first occurrence. The elapsed processing time is
        /// appended to the processing log file and written to the debug output.
        /// </remarks>
        /// <param name="_text">Text to process; may be null or empty.</param>
        /// <returns>
        /// The literal string "NULL" when <paramref name="_text"/> is null or empty;
        /// otherwise the kept words joined by single spaces (possibly an empty string).
        /// </returns>
        public static string TermsPrepataions(string _text)
        {
            // Nothing to process: return the sentinel before touching the database.
            if (string.IsNullOrEmpty(_text))
            {
                return("NULL");
            }

            // A hash set makes the "already in vocabulary" test O(1) per word.
            HashSet <string> knownTerms = new HashSet <string>();

            // Dispose the context once the vocabulary is read so the connection is released.
            using (ArticleDBDataModelContainer dbContainer = new ArticleDBDataModelContainer())
            {
                foreach (var element in dbContainer.Set <Terms_Vocabulary>())
                {
                    knownTerms.Add(element.term_value);
                }
            }

            var text_preparation = Stopwatch.StartNew();

            char[]   splitChars        = { ' ', ',', '.', ';', '-', ':' };
            string[] removableWords    = { "and", "or", "it", "at", "all", "in", "on", "under", "between", "a", "an", "the", "to", "pod", "nad", "tam", "tutaj", "między", "pomiędzy", "w", "przed", "się", "z", "na", "od", "jest", "iż", "co", "we", "ich", "ciebie", "ja", "ty", "ona", "ono", "oni", "owych", "of", "cz", "do", "s", "n", "r", "nr", "rys", "i", "by", "from", "o", "//", "**", "po", "jej", "przy", "rzecz", "jak", "tymi", "są", "czy", "oraz", "ze", "m", "p", "off", "for", "/", "is", "as", "be", "will", "go", "za", "też", "lub", "t", "poz", "wiad", "set", "use", "etc", "also", "are", "tzw", "out", "other", "its", "has", "<", ">", "pre", "its", "has", "are", "with", "[et", "]", "vol", "leszek", "j", "al", "tych", "tym" };
            char[]   not_allowed_chars = { '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '!', '@', '#', '$', '%', '^', '&', '\'', '"', '[', ']', '{', '}', '(', ')' };

            string[] splittedTitle = _text.ToLower().Split(splitChars, StringSplitOptions.RemoveEmptyEntries);

            // The length and character filters apply unconditionally; they must not
            // depend on whether the vocabulary table already has rows.
            List <string> Words = new List <string>();
            foreach (string word in splittedTitle)
            {
                bool longEnough    = word.Length >= 3;
                bool onlyAllowed   = word.IndexOfAny(not_allowed_chars) < 0;
                bool notYetStored  = !knownTerms.Contains(word);

                if (longEnough && onlyAllowed && notYetStored)
                {
                    Words.Add(word);
                }
            }

            // NOTE(review): the earlier implementation also looped over the allowed-term
            // dictionary file, but that loop modified a temporary copy and never changed
            // the result. It is omitted here; add a real filter if short terms should be
            // checked against the dictionary.

            // Except() yields each remaining word once, in order of first occurrence.
            var stemmingString = string.Join(" ", Words.Except(removableWords));

            text_preparation.Stop();

            string processing_log = @"F:\Magistry files\Processing_log.txt";

            using (StreamWriter sw = File.AppendText(processing_log))
            {
                sw.WriteLine(DateTime.Now.ToString() + "The text processing time is: " + text_preparation.Elapsed.Minutes.ToString() + ":" + text_preparation.Elapsed.TotalMilliseconds.ToString());
            }

            Debug.WriteLine("The text processing time is: " + text_preparation.Elapsed.Minutes.ToString() + ":" + text_preparation.Elapsed.TotalMilliseconds);

            return(stemmingString);
        }