/// <summary>
/// Builds the text corpus of the "geodesy" document class from the PG articles table.
/// </summary>
/// <returns>
/// One string per selected article: its title, abstract and keywords concatenated
/// without any separator.
/// </returns>
public static List<string> GeodesyClasseOfDocuments_ListCreations()
{
    // An earlier revision seeded this list with hand-written sample documents;
    // the members of the class are now selected from the database instead.
    // Both parenthesised groups of the WHERE clause test the same '%GEODE%' pattern.
    const string geodesyQuery = @"SELECT * FROM dbo.PG_ArticlesSet WHERE (PG_ArticlesSet.abstractText LIKE '%GEODE%' OR PG_ArticlesSet.keywords LIKE '%GEODE%') OR (PG_ArticlesSet.abstractText LIKE '%GEODE%' OR PG_ArticlesSet.keywords LIKE '%GEODE%')";

    using (var articleDb = new ArticleDBDataModelContainer())
    {
        // Materialised inside the using block so the query runs before the context is disposed.
        return articleDb.PG_ArticlesSet
            .SqlQuery(geodesyQuery)
            .Select(article => article.title + article.abstractText + article.keywords)
            .ToList();
    }
}
/// <summary>
/// Exports every row of dbo.AuthorSet to a CSV file and to a JavaScript file that
/// declares an <c>authors</c> array of serialized <c>AuthorsJsonObj</c> objects.
/// </summary>
/// <param name="authorsCSV">Path of the CSV file. The header and rows are appended to it.</param>
/// <param name="authorsJson">Path of the JavaScript file. The array declaration is appended to it.</param>
public static void GenerateAuthorsToCSVandJsonFromDB(string authorsCSV, string authorsJson)
{
    // Quote a text field and double any embedded quote so the row stays parseable.
    // A null value is written as an empty quoted field, as plain concatenation would do.
    Func<string, string> quoteCsv =
        value => "\"" + (value ?? string.Empty).Replace("\"", "\"\"") + "\"";

    var csvContent = new StringBuilder();
    csvContent.Append(authorHeaderCSV);
    var jsonObjects = new List<string>();

    using (var authorDbContext = new ArticleDBDataModelContainer())
    {
        var authors = authorDbContext.AuthorSet.SqlQuery("SELECT * FROM dbo.AuthorSet").ToList();
        foreach (var author in authors)
        {
            csvContent.Append("\"" + author.author_Id + "\",")
                      .Append(quoteCsv(author.author_name)).Append(',')
                      .Append(quoteCsv(author.author_surename))
                      .Append('\n');

            var authorJson = new AuthorsJsonObj(author.author_Id, author.author_name, author.author_surename);
            jsonObjects.Add(JsonConvert.SerializeObject(authorJson));
        }
    }

    // Array elements must be comma separated; the previous revision emitted one object
    // per line with no separator, which is not a valid array for more than one author.
    var jsonContent = new StringBuilder("var authors = [");
    jsonContent.Append(string.Join(",\n", jsonObjects));
    if (jsonObjects.Count > 0)
    {
        jsonContent.Append('\n');
    }
    jsonContent.Append("]");

    using (StreamWriter csvWriter = File.AppendText(authorsCSV))
    {
        csvWriter.Write(csvContent.ToString());
    }

    using (StreamWriter jsonWriter = File.AppendText(authorsJson))
    {
        jsonWriter.Write(jsonContent.ToString());
    }
}
/// <summary>
/// Looks up the author line(s) stored for an article id, trying the PG, PP, UG, UMK and
/// WSB article tables in that order and stopping at the first table that returns rows.
/// </summary>
/// <param name="ArticleID">Value matched against the <c>article_Id</c> column.</param>
/// <returns>
/// The author values of the first table with a match, joined with ", ";
/// an empty string when no table contains the id.
/// </returns>
public static string SelectAutorsFromDB(int ArticleID)
{
    // Ordered fallbacks. The previous revision chained them with "else if" on the same
    // condition, so the UG, UMK and WSB tables could never be queried.
    string[] authorQueries =
    {
        "SELECT authors FROM dbo.PG_ArticlesSet WHERE article_Id = @p0",
        "SELECT article_author_line FROM dbo.PP_ArticlesSet WHERE article_Id = @p0",
        "SELECT article_author_line FROM dbo.UG_ArticlesSet WHERE article_Id = @p0",
        "SELECT article_authors_line FROM dbo.UMK_ArticlesSet WHERE article_Id = @p0",
        "SELECT article_authors FROM dbo.WSB_ArticlesSet WHERE article_Id = @p0"
    };

    List<string> authorLines = new List<string>();
    using (var articleDb = new ArticleDBDataModelContainer())
    {
        foreach (string query in authorQueries)
        {
            // The id is passed as a positional parameter instead of being concatenated into the SQL text.
            authorLines = articleDb.Database.SqlQuery<string>(query, ArticleID).ToList();
            if (authorLines.Count > 0)
            {
                break;
            }
        }
    }

    return string.Join(", ", authorLines);
}
/// <summary>
/// Builds the text corpus of the "survey and measurements" document class from the
/// PG articles table.
/// </summary>
/// <returns>
/// One string per selected article: its title, abstract and keywords concatenated
/// without any separator.
/// </returns>
public static List<string> SurveyAndMeasurementsClassOfDocuments_ListCreations()
{
    // Selects rows whose abstract or keywords match '%BADAN%' or '%POMIAR%'.
    const string surveyQuery = @"SELECT * FROM dbo.PG_ArticlesSet WHERE (PG_ArticlesSet.abstractText LIKE '%BADAN%' OR PG_ArticlesSet.keywords LIKE '%BADAN%') OR (PG_ArticlesSet.abstractText LIKE '%POMIAR%' OR PG_ArticlesSet.keywords LIKE '%POMIAR%')";

    var classDocuments = new List<string>();
    using (var articleDb = new ArticleDBDataModelContainer())
    {
        var matches = articleDb.PG_ArticlesSet.SqlQuery(surveyQuery);
        classDocuments.AddRange(
            matches.Select(article => article.title + article.abstractText + article.keywords));
    }

    return classDocuments;
}
/// <summary>
/// Rebuilds the class-level <c>termCollection</c> set from the Terms_Vocabulary table.
/// </summary>
/// <returns>
/// The freshly assigned <c>termCollection</c>, containing every term value lower-cased.
/// </returns>
public static HashSet<string> getTermCollection()
{
    // The member is replaced before loading, so a failure below leaves it empty or partially filled.
    termCollection = new HashSet<string>();

    using (var vocabularyDb = new ArticleDBDataModelContainer())
    {
        var vocabulary = vocabularyDb.Terms_Vocabulary;
        vocabulary.Load();
        termCollection.UnionWith(vocabulary.Local.Select(entry => entry.term_value.ToLower()));
    }

    return termCollection;
}
/// <summary>
/// Loads the PG, PP, UG, UMK and WSB article tables (in that order) and builds a
/// dictionary from article id to the concatenated text fields of the article.
/// The elapsed time and counters are appended to the processing log file.
/// </summary>
/// <returns>
/// Article id to document text. When the same id occurs more than once, the first
/// article encountered is kept and later ones are skipped.
/// </returns>
public static Dictionary<int, string> GenerateDocumentCollection_withoutLazyLoadingToDictionary()
{
    var documentCollection = new Dictionary<int, string>();
    int addedDocuments = 0;
    // The log line also reports how many contexts were opened and how many times the
    // body ran; both are always 1 for a single call.
    const int contextCounter = 1;
    const int executionCounter = 1;

    var database_processing = Stopwatch.StartNew();
    using (var dbContext = new ArticleDBDataModelContainer())
    {
        dbContext.PG_ArticlesSet.Load();
        foreach (var article in dbContext.PG_ArticlesSet.Local)
        {
            string record = article.title + article.abstractText + article.keywords;
            if (TryAddDocument(documentCollection, Convert.ToInt32(article.article_Id), record))
            {
                addedDocuments++;
            }
        }

        dbContext.PP_ArticlesSet.Load();
        foreach (var article in dbContext.PP_ArticlesSet.Local)
        {
            string record = article.article_title + article.article_source;
            if (TryAddDocument(documentCollection, Convert.ToInt32(article.article_Id), record))
            {
                addedDocuments++;
            }
        }

        dbContext.UG_ArticlesSet.Load();
        foreach (var article in dbContext.UG_ArticlesSet.Local)
        {
            string record = article.article_title + article.article_source + article.article_keywords;
            if (TryAddDocument(documentCollection, Convert.ToInt32(article.article_Id), record))
            {
                addedDocuments++;
            }
        }

        dbContext.UMK_ArticlesSet.Load();
        foreach (var article in dbContext.UMK_ArticlesSet.Local)
        {
            string record = article.article_title + article.article_Full_title
                + article.article_eng_keywords + article.article_pl_keywords
                + article.article_translated_title;
            if (TryAddDocument(documentCollection, Convert.ToInt32(article.article_Id), record))
            {
                addedDocuments++;
            }
        }

        dbContext.WSB_ArticlesSet.Load();
        foreach (var article in dbContext.WSB_ArticlesSet.Local)
        {
            string record = article.article_title + article.article_common_title
                + article.article_title_other_lang + article.article_eng_keywords
                + article.article_pl_keywords + article.article_details;
            if (TryAddDocument(documentCollection, Convert.ToInt32(article.article_Id), record))
            {
                addedDocuments++;
            }
        }
    }
    database_processing.Stop();

    string processing_log = @"F:\Magistry files\Processing_log.txt";
    using (StreamWriter sw = File.AppendText(processing_log))
    {
        sw.WriteLine(DateTime.Now.ToString() + " The database processing time is: "
            + database_processing.Elapsed.Minutes.ToString() + ":"
            + database_processing.Elapsed.TotalMilliseconds.ToString()
            + ", database context counter: " + contextCounter.ToString()
            + ", selection counter in one dbContext: " + addedDocuments.ToString()
            + ", method executing counter: " + executionCounter.ToString());
    }

    return documentCollection;
}

/// <summary>
/// Adds a document under <paramref name="articleId"/> unless that id is already present.
/// </summary>
/// <returns><c>true</c> when the document was added, <c>false</c> when the id was already taken.</returns>
private static bool TryAddDocument(Dictionary<int, string> documents, int articleId, string text)
{
    // The previous test "!ContainsKey || !ContainsValue" still reached Add when the id
    // existed with different text, which made Dictionary.Add throw ArgumentException.
    // It also scanned every stored value for each article.
    if (documents.ContainsKey(articleId))
    {
        return false;
    }

    documents.Add(articleId, text);
    return true;
}
/// <summary>
/// Builds the text corpus of the "architecture" document class from the PG articles table.
/// Open questions about how classes should be assigned to documents:
/// - by the organizations assigned to a document (not every document has one);
/// - by its title, abstract or keywords;
/// - how can the assignment of classes to documents be automated?
/// </summary>
/// <returns>
/// One string per selected article: its title, abstract and keywords concatenated
/// without any separator.
/// </returns>
public static List<string> ArchitectureClasseOfDocuments_ListCreations()
{
    // An earlier revision seeded this list with hand-written sample documents;
    // the members of the class are now selected from the database instead.
    // The abstractText test appears twice in the WHERE clause.
    const string architectureQuery = @"SELECT * FROM dbo.PG_ArticlesSet WHERE PG_ArticlesSet.abstractText LIKE '%ARCHITEKT%' OR PG_ArticlesSet.keywords LIKE '%ARCHITEKT%' OR PG_ArticlesSet.abstractText LIKE '%ARCHITEKT%';";

    using (var articleDb = new ArticleDBDataModelContainer())
    {
        var classDocuments = new List<string>();
        foreach (var article in articleDb.PG_ArticlesSet.SqlQuery(architectureQuery).ToList())
        {
            string documentText = article.title + article.abstractText + article.keywords;
            classDocuments.Add(documentText);
        }

        return classDocuments;
    }
}
/// <summary>
/// Builds the vector space model of a document collection: for every document, one
/// TF-IDF weight per term of the Terms_Vocabulary table. The weights are computed in
/// parallel and the elapsed time is appended to the processing log file.
/// </summary>
/// <param name="collection">Documents to vectorize. It is only read.</param>
/// <returns>
/// One <c>DocumentVector</c> per document, in the same order as <paramref name="collection"/>.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="collection"/> is null.</exception>
public static List<DocumentVector> DocumentCollectionProcessing(List<String> collection)
{
    if (collection == null)
    {
        throw new ArgumentNullException(nameof(collection));
    }

    parallelOption.MaxDegreeOfParallelism = 20;
    var vector_space_model_calculation = Stopwatch.StartNew();

    // The class-level term set is refreshed from the database on every call.
    termHashset = new HashSet<string>();
    using (var dbContext = new ArticleDBDataModelContainer())
    {
        dbContext.Terms_Vocabulary.Load();
        foreach (var terms in dbContext.Terms_Vocabulary.Local)
        {
            termHashset.Add(terms.term_value.ToLower());
        }
    }

    // Snapshot of the terms: gives every worker the same fixed term order.
    string[] vocabulary = termHashset.ToArray();

    // Each worker writes only its own slot. The previous revision shared one vector
    // buffer and one DocumentVector variable between all workers and called List.Add
    // concurrently, so vectors could be overwritten or lost.
    var vectors = new DocumentVector[collection.Count];

    Parallel.For(0, collection.Count, parallelOption, documentIndex =>
    {
        string document = collection[documentIndex];

        float[] space = new float[vocabulary.Length];
        for (int termIndex = 0; termIndex < vocabulary.Length; termIndex++)
        {
            space[termIndex] = Logic.ClusteringAlgorithms.Used_functions.TFIDF2ndrealization.FindTFIDF(
                collection, document, vocabulary[termIndex]);
        }

        var documentVector = new DocumentVector();
        documentVector.Content = document;
        documentVector.VectorSpace = space;
        // Kept as IndexOf: documents with identical text all get the index of the first occurrence.
        documentVector.index_Of_Doc_for_labeling = collection.IndexOf(document);
        vectors[documentIndex] = documentVector;
    });

    vector_space_model_calculation.Stop();

    string processing_log = @"F:\Magistry files\Processing_log.txt";
    using (StreamWriter sw = File.AppendText(processing_log))
    {
        sw.WriteLine(DateTime.Now.ToString() + " The vector space model calculation time is: "
            + vector_space_model_calculation.Elapsed.Minutes.ToString() + ":"
            + vector_space_model_calculation.Elapsed.TotalMilliseconds.ToString());
    }

    return new List<DocumentVector>(vectors);
}
/// <summary>
/// Imports every file of the <c>filePathBibtex</c> directory as one PG article.
/// The first 14 lines of each file are scanned for key/value fields, then an article
/// with its authors and vocabulary terms is stored through Entity Framework.
/// A failure while storing one file is appended to the crawler log and the import
/// continues with the next file.
/// </summary>
public static void LoadBibtexFile()
{
    string[] fileEntries = Directory.GetFiles(filePathBibtex);
    foreach (string file in fileEntries)
    {
        // Start every file from a clean state so values of a previous file (or of a
        // previous failed save) are never attached to this article.
        ResetParsedBibtexFields();
        ParseBibtexFields(file);

        try
        {
            SaveParsedBibtexArticle();
            Console.WriteLine("End of file! Go to the next ->");
        }
        catch (Exception ex)
        {
            // Appended, so an error of an earlier file is not overwritten by a later one.
            File.AppendAllText(@"F:\\Magistry files\PG_crawler_Log.txt", ex.ToString() + Environment.NewLine);
        }
    }
}

/// <summary>Clears the class-level fields that hold the values parsed from one file.</summary>
private static void ResetParsedBibtexFields()
{
    _title = null;
    _abstract = null;
    _keywords = null;
    _year = 0;
    _country = null;
    _authorsLine = null;
    _authors = null;
    _organization = null;
    _url = null;
}

/// <summary>
/// Reads up to 14 lines of <paramref name="file"/> and stores the recognised fields in
/// the class-level fields (_title, _abstract, _keywords, _year, ...).
/// </summary>
private static void ParseBibtexFields(string file)
{
    using (StreamReader reader = new StreamReader(File.OpenRead(file)))
    {
        context = new string[14];
        separatedContext = new string[2];

        for (int i = 0; i < context.Length; i++)
        {
            context[i] = reader.ReadLine();
            if (context[i] == null)
            {
                break; // end of file: the remaining slots stay null
            }

            Console.WriteLine("Processing " + i.ToString() + " line.");
            context[i] = context[i].TrimStart(' ').Replace('\"', ' ').Replace('\\', ' ').TrimEnd(',');
            separatedContext = context[i].Split(separators, 2, StringSplitOptions.RemoveEmptyEntries);

            // Lines without a key and a value were previously skipped through a swallowed
            // IndexOutOfRangeException; they are now skipped explicitly.
            if (separatedContext.Length < 2)
            {
                continue;
            }

            string key = separatedContext[0];
            string value = separatedContext[1];

            // The order of the tests matters because keys are matched with Contains.
            if (key.Contains("title"))
            {
                if (value.Length >= 2)
                {
                    _title = value;
                }
            }
            else if (key.Contains("abstract"))
            {
                // NOTE(review): the previous revision tried to strip digits and some
                // punctuation here but discarded the result of string.Remove, so the
                // abstract was always stored unchanged. Stripping is intentionally not
                // introduced; confirm whether it is wanted.
                if (value.Length >= 5)
                {
                    _abstract = value;
                }
            }
            else if (key.Contains("keywords"))
            {
                if (!string.IsNullOrWhiteSpace(value))
                {
                    _keywords = value;
                }
            }
            else if (key.Contains("year"))
            {
                // A malformed year is ignored instead of being raised and swallowed.
                int parsedYear;
                if (int.TryParse(value, out parsedYear))
                {
                    _year = parsedYear;
                }
            }
            else if (key.Contains("country"))
            {
                _country = value;
            }
            else if (key.Contains("author"))
            {
                _authorsLine = value;
                _authors = value.Split(authorSeparator, StringSplitOptions.RemoveEmptyEntries);
            }
            else if (key.Contains("organization"))
            {
                _organization = value;
            }
            else if (key.Contains("url"))
            {
                _url = value;
            }
        }
    }
}

/// <summary>
/// Stores the currently parsed fields as a new PG article, together with its authors
/// and the vocabulary terms derived from its title, abstract and keywords.
/// Fields that were not found in the file are stored as null.
/// </summary>
private static void SaveParsedBibtexArticle()
{
    using (var dbContext = new ArticleDBDataModelContainer())
    {
        var document = new StringBuilder();
        var bibtexArticle = dbContext.PG_ArticlesSet.Create();

        // The previous guards ("!= Empty || != " " || != null") were always true, so the
        // term preparation was also invoked for missing values.
        bibtexArticle.title = _title;
        if (!string.IsNullOrWhiteSpace(_title))
        {
            document.Append(TextPreparing.TermsPrepataions(_title));
        }
        _title = null;

        bibtexArticle.abstractText = _abstract;
        if (!string.IsNullOrWhiteSpace(_abstract))
        {
            document.Append(TextPreparing.TermsPrepataions(_abstract));
        }
        _abstract = null;

        bibtexArticle.keywords = _keywords;
        if (!string.IsNullOrWhiteSpace(_keywords))
        {
            document.Append(TextPreparing.TermsPrepataions(_keywords));
        }
        _keywords = null;

        bibtexArticle.year = _year;
        bibtexArticle.country = _country;
        _country = null;
        bibtexArticle.authors = _authorsLine;
        _authorsLine = null;
        bibtexArticle.organizations = _organization;
        _organization = null;
        bibtexArticle.url = _url;
        _url = null;

        // Authors are consumed two entries at a time: first the name, then the surname.
        // A trailing unpaired entry is ignored.
        if (_authors != null)
        {
            for (int i = 0; i + 1 < _authors.Length; i += 2)
            {
                var authors_of_the_article = dbContext.AuthorSet.Create();
                authors_of_the_article.author_name = _authors[i];
                authors_of_the_article.author_surename = _authors[i + 1];
                bibtexArticle.Author.Add(authors_of_the_article);
            }
        }

        dbContext.PG_ArticlesSet.Add(bibtexArticle);

        foreach (string term in ExtractVocabularyTerms(document.ToString()))
        {
            var terms = dbContext.Terms_Vocabulary.Create();
            terms.term_value = term;
            bibtexArticle.Terms_Vocabulary.Add(terms);
        }

        dbContext.SaveChanges();
    }
}

/// <summary>
/// Splits prepared text on space, ';', ':' and ',' and keeps the tokens longer than
/// three characters (empty and blank tokens are therefore dropped as well).
/// </summary>
private static string[] ExtractVocabularyTerms(string preparedText)
{
    // The previous revision removed items from the list it was enumerating, which throws
    // InvalidOperationException after the first removal. It also called
    // string.Remove(z, 1) with an index into the forbidden-character table, which
    // throws for short tokens and otherwise had no effect because the result was discarded.
    return preparedText
        .Split(' ', ';', ':', ',')
        .Where(token => token.Length > 3)
        .ToArray();
}
public static void get_PP_Document_content() { string[] PP_newcontent = new string[hapDoc.DocumentNode.InnerText.Length]; string[] PP_separatedContent = new string[hapDoc.DocumentNode.InnerText.Length]; PP_articles_Count = 0; string[] PP_articles_Matrix = { String.Empty }; using (StringReader sr = new StringReader(endText)) { int p = 0; string PP_line; while ((PP_line = sr.ReadLine()) != null) { PP_newcontent[p] = PP_line; PP_separatedContent = PP_line.Split(line_separator, 2); if (PP_separatedContent.Length == 1 & PP_separatedContent[0] == "") { continue; } else if (PP_separatedContent.Length == 1 && PP_articles_Matrix.Any(x => PP_separatedContent[0].Contains(x))) { if (PP_author_line != null && PP_Tytul != null) { try { using (var PPdbContext = new ArticleDBDataModelContainer()) { var document = new StringBuilder(); var pp_article = PPdbContext.PP_ArticlesSet.Create(); pp_article.article_author_line = PP_author_line; PP_author_line = null; pp_article.article_title = PP_Tytul; if (PP_Tytul != String.Empty || PP_Tytul != " " || PP_Tytul != null) { var termTitlePP = TextPreparing.TermsPrepataions(PP_Tytul); document.Append(termTitlePP); } PP_Tytul = null; pp_article.article_source = PP_Zrodlo; if (PP_Zrodlo != String.Empty || PP_Zrodlo != " " || PP_Zrodlo != null) { var termSourcePP = TextPreparing.TermsPrepataions(PP_Zrodlo); document.Append(termSourcePP); } else { PP_Zrodlo = "Not defined"; document.Append(PP_Zrodlo); } PP_Zrodlo = null; pp_article.article_year = PP_Rok; PP_Rok = 0; pp_article.article_language = PP_Jezyk_Publikacji; PP_Jezyk_Publikacji = null; pp_article.article_DOI = PP_DOI; PP_DOI = null; /* * pp_article.article_details = PP_Uwagi; * PP_Uwagi = null; * pp_article.article_URL = PP_Adres_URL; * PP_Adres_URL = null; */ for (int z = 0; z <= PP_autors.Length - 4;) { var authors_of_the_PP_article = PPdbContext.AuthorSet.Create(); if (PP_autors[z] != "IC)") { authors_of_the_PP_article.author_name = PP_autors[z + 1]; 
authors_of_the_PP_article.author_surename = PP_autors[z]; pp_article.Author.Add(authors_of_the_PP_article); } z += 4; } PPdbContext.PP_ArticlesSet.Add(pp_article); var _document = document.ToString().Split(' ', ';', ':', ','); for (int k = 0; k <= _document.Length - 1; k++) { var terms = PPdbContext.Terms_Vocabulary.Create(); string dictionary_text = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv"); string[] allowed_dictionary = dictionary_text.Split(',', '\n'); for (int d = 0; d <= _document.Length - 1; d++) { for (int j = 0; j <= allowed_dictionary.Length - 1; j++) { if (_document[d].Length > 3 && _document[d].Contains(allowed_dictionary[j])) { continue; } else if (_document[d].Length <= 3 && !(_document[d].Contains(allowed_dictionary[j]))) { _document.ToList().RemoveAt(d); } } } //tutaj potrzebnie przepisac id dokumenta w ktorym wystepuje dane slowo if (_document[k] != String.Empty || _document[k] != " " || _document[k] != null || _document[k] != Char.IsDigit(' ').ToString()) { //dbContext.Terms_Vocabulary.Where(u) var termVocabularyTable = PPdbContext.Terms_Vocabulary; terms.term_value = _document[k]; } pp_article.Terms_Vocabulary.Add(terms); } PPdbContext.SaveChanges(); } } catch (Exception ex) { File.WriteAllText(@"F:\\Magistry files\PP_crawler_Log.txt", ex.ToString()); } } else { File.WriteAllText(@"F:\\Magistry files\PP_crawler_Log.txt", "Empty line detected." 
+ '\n'); } } else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Liczba odnalezionych") || PP_separatedContent[0] == "Liczba odnalezionych rekordow")) { PP_articles_Count = Convert.ToInt32(PP_separatedContent[1]); PP_articles_Matrix = new string[PP_articles_Count]; for (int l = 0; l <= PP_articles_Count - 1; l++) { PP_articles_Matrix[l] = (l + 1) + "."; } } if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].ToLower().Contains("autor") || PP_separatedContent[0].Contains("Autor") || PP_separatedContent[0] == "Autor")) { PP_author_line = PP_separatedContent[1]; var PP_author_line_modified = PP_author_line.Replace("(", String.Empty); PP_autors = PP_separatedContent[1].Split(autor_separators, StringSplitOptions.RemoveEmptyEntries); } else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].ToLower().Contains("tytu") || PP_separatedContent[0].ToLower().Contains("tytul") || PP_separatedContent[0].Contains("Tytul"))) { PP_Tytul = PP_separatedContent[1]; } else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Zrodlo") || PP_separatedContent[0].ToLower().Contains("zrodlo"))) { PP_Zrodlo = PP_separatedContent[1]; } else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Rok") || PP_separatedContent[0].ToLower().Contains("rok"))) { string rok = ""; if (PP_separatedContent[1] != "" | PP_separatedContent[1] == String.Empty) { rok = null; } else { rok = PP_separatedContent[1].Substring(0, 5); } PP_Rok = Convert.ToInt32(rok); } else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Jezyk publikacji") || PP_separatedContent[0].ToLower().Contains("jezyk publikacji") || PP_separatedContent[0].Contains("Język publikacji") || PP_separatedContent[0].ToLower().Contains("język publikacji"))) { PP_Jezyk_Publikacji = PP_separatedContent[1]; } else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("DOI") || 
PP_separatedContent[0].ToLower().Contains("doi") || PP_separatedContent[0] == "DOI")) { PP_DOI = PP_separatedContent[1]; } p++; } #region Old_code /* 22.08.2018 - old version * for (int i = 0; i <= hapDoc.DocumentNode.InnerText.Length; i++) * { * PP_line = sr.ReadLine(); * int counter = 0; * if (PP_line != null) * { * PP_newcontent[i] = PP_line; * PP_separatedContent = PP_line.Split(line_separator,2); * * * if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].ToLower().Contains("autor") || PP_separatedContent[0].Contains("Autor") || PP_separatedContent[0] == "Autor")) * { * //System.Windows.MessageBox.Show(PP_separatedContent[1]); * PP_author_line = PP_separatedContent[1]; * var PP_author_line_modified = PP_author_line.Replace("(", String.Empty); * * PP_autors = PP_separatedContent[1].Split(autor_separators, StringSplitOptions.RemoveEmptyEntries); * * } * else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Liczba odnalezionych") || PP_separatedContent[0] == "Liczba odnalezionych rekordow")) * { * PP_articles_Count = Convert.ToInt32(PP_separatedContent[1]); * PP_articles_Matrix = new string[PP_articles_Count]; * for (int l = 0; l <= PP_articles_Count - 1; l++) * { * PP_articles_Matrix[l] = (l + 1) + "."; * } * } * else if (PP_separatedContent.Length == 1 && PP_articles_Matrix.Any(x => PP_separatedContent[0].Contains(x))) * { * if (PP_author_line != null && PP_Tytul != null) * { * ///<summary> * ///PPArticle_Entity_Object_creation_Model_first * /// </summary> * try * { #region PP_Article_Object_creation_Model_First * using (var PPdbContext = new ArticleDBDataModelContainer()) * { * var document = new StringBuilder(); * var pp_article = PPdbContext.PP_ArticlesSet.Create(); * * pp_article.article_author_line = PP_author_line; * PP_author_line = null; * * pp_article.article_title = PP_Tytul; * if (PP_Tytul != String.Empty || PP_Tytul != " " || PP_Tytul != null) * { * var termTitlePP = TextPreparing.TermsPrepataions(PP_Tytul); * 
document.Append(termTitlePP); * } * PP_Tytul = null; * * pp_article.article_source = PP_Zrodlo; * if (PP_Zrodlo != String.Empty || PP_Zrodlo != " " || PP_Zrodlo != null) * { * var termSourcePP = TextPreparing.TermsPrepataions(PP_Zrodlo); * document.Append(termSourcePP); * } * PP_Zrodlo = null; * * pp_article.article_year = PP_Rok; * PP_Rok = 0; * pp_article.article_language = PP_Jezyk_Publikacji; * PP_Jezyk_Publikacji = null; * pp_article.article_DOI = PP_DOI; * PP_DOI = null; * // * pp_article.article_details = PP_Uwagi; * PP_Uwagi = null; * pp_article.article_URL = PP_Adres_URL; * PP_Adres_URL = null; * // * * for (int z = 0; z <= PP_autors.Length - 4;) * { * var authors_of_the_PP_article = PPdbContext.AuthorSet.Create(); * if (PP_autors[z] != "IC)") * { * authors_of_the_PP_article.author_name = PP_autors[z + 1]; * authors_of_the_PP_article.author_surename = PP_autors[z]; * pp_article.Author.Add(authors_of_the_PP_article); * } * z += 4; * } * PPdbContext.PP_ArticlesSet.Add(pp_article); * * var _document = document.ToString().Split(' ', ';', ':', ','); * for (int k = 0; k <= _document.Length - 1; k++) * { * var terms = PPdbContext.Terms_Vocabulary.Create(); * * // * string dictionary_text = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv"); * string[] allowed_dictionary = dictionary_text.Split(',', '\n'); * * for (int p = 0; p <= _document.Length - 1; p++) * { * for (int j = 0; j <= allowed_dictionary.Length - 1; j++) * { * if (_document[p].Length > 3 && _document[p].Contains(allowed_dictionary[j])) * { * continue; * } * else if (_document[p].Length <= 3 && !(_document[p].Contains(allowed_dictionary[j]))) * { * _document.ToList().RemoveAt(p); * } * * } * } * //tutaj potrzebnie przepisac id dokumenta w ktorym wystepuje dane slowo * if (_document[k] != String.Empty || _document[k] != " " || _document[k] != null || _document[k] != Char.IsDigit(' ').ToString()) * { * //dbContext.Terms_Vocabulary.Where(u) * var termVocabularyTable = 
PPdbContext.Terms_Vocabulary; * terms.term_value = _document[k]; * * } * pp_article.Terms_Vocabulary.Add(terms); * } * * PPdbContext.SaveChanges(); * } #endregion * } * catch (Exception ex) * { * File.WriteAllText(@"F:\\Magistry files\PP_crawler_Log.txt", ex.ToString()); * } * ///<summary> * /// PPArticle_Entity_Object_Creation * /// </summary> #region PPArticle_Entity_Object_Creation * /* * using (var dbppcontext = new PublicationsContext()) * { * var pp_article = new PPArticle(); * pp_article.article_author_line = PP_author_line; * PP_author_line = null; * pp_article.article_title = PP_Tytul; * PP_Tytul = null; * pp_article.article_source = PP_Zrodlo; * PP_Zrodlo = null; * pp_article.article_year = PP_Rok; * PP_Rok = 0; * pp_article.article_language = PP_Jezyk_Publikacji; * PP_Jezyk_Publikacji = null; * pp_article.article_DOI = PP_DOI; * PP_DOI = null; * pp_article.article_details = PP_Uwagi; * PP_Uwagi = null; * pp_article.article_URL = PP_Adres_URL; * PP_Adres_URL = null; * * * * var authors_of_the_article = new Authors(); * for (int k = 0; k <= PP_autors.Length - 2; k++) * { * authors_of_the_article.author_name = PP_autors[k]; * authors_of_the_article.author_surename = PP_autors[k + 1]; * dbppcontext.Authors.Add(authors_of_the_article); * * } * //dbppcontext.PP_Articles.Add(pp_article); * dbppcontext.PP_Articles.Attach(pp_article); * dbppcontext.Entry(pp_article).State = System.Data.Entity.EntityState.Added; * dbppcontext.SaveChanges(); * //dbppcontext.SaveChanges(); * } * // * //#endregion * } * else * { * //System.Windows.MessageBox.Show("Brak danych"); * } * * } * else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].ToLower().Contains("tytu") || PP_separatedContent[0].ToLower().Contains("tytul") || PP_separatedContent[0].Contains("Tytul"))) * { * PP_Tytul = PP_separatedContent[1]; * //System.Windows.MessageBox.Show(PP_Tytul); * } * else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Zrodlo") || 
PP_separatedContent[0].ToLower().Contains("zrodlo"))) * { * PP_Zrodlo = PP_separatedContent[1]; * //System.Windows.MessageBox.Show(PP_Zrodlo); * } * else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Rok") || PP_separatedContent[0].ToLower().Contains("rok"))) * { * var rok = PP_separatedContent[1].Substring(0, 5); * PP_Rok = Convert.ToInt32(rok); * //System.Windows.MessageBox.Show(PP_Rok.ToString()); * } * else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Jezyk publikacji") || PP_separatedContent[0].ToLower().Contains("jezyk publikacji") || PP_separatedContent[0].Contains("Język publikacji") || PP_separatedContent[0].ToLower().Contains("język publikacji"))) * { * PP_Jezyk_Publikacji = PP_separatedContent[1]; * //System.Windows.MessageBox.Show(PP_Jezyk_Publikacji); * } * else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("DOI") || PP_separatedContent[0].ToLower().Contains("doi") || PP_separatedContent[0] == "DOI")) * { * PP_DOI = PP_separatedContent[1]; * //System.Windows.MessageBox.Show(PP_DOI); * } * /* * else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Uwagi") || PP_separatedContent[0].ToLower().Contains("uwagi") || PP_separatedContent[0] == "Uwagi")) * { * PP_Uwagi = PP_separatedContent[1]; * System.Windows.MessageBox.Show(PP_Uwagi); * } * else if (PP_separatedContent.Length == 2 && (PP_separatedContent[0].Contains("Adres url") || PP_separatedContent[0].ToLower().Contains("adres url") || PP_separatedContent[0] == "Adres url")) * { * PP_Adres_URL = PP_separatedContent[1]; * System.Windows.MessageBox.Show(PP_Adres_URL = PP_separatedContent[1]); * } * // * * //else if (PP_separatedContent.Length == 1 && PP_separatedContent[0] == String.Empty) System.Windows.MessageBox.Show("The empty line detected", "Empty line", System.Windows.MessageBoxButton.OK); * else * { * //System.Windows.MessageBox.Show("Error! 
Content not found!", "Error!", System.Windows.MessageBoxButton.OK); * * } * counter++; * } * } */ #endregion } }
/// <summary>
/// Parses the UMK listing held in <c>endText</c> line by line and stores every
/// recognised article record in the database.
/// </summary>
/// <remarks>
/// Each line is split into at most two parts with <c>line_separator</c>: a label and a
/// value. Labelled lines fill the static <c>UMK_*</c> buffers. A single-part line that
/// contains one of the record markers ("1.", "2.", ...) flushes the buffered record,
/// provided both an author line and a title have been collected.
/// NOTE(review): a record is flushed only when a later marker line is read, so the last
/// record of the input seems to rely on a trailing marker line - confirm with real data.
/// </remarks>
public static void get_UMK_Document_content()
{
    UMK_articles_Count = 0;

    // Until the "Liczba odnalezionych" line announces the record count, the only marker
    // is the empty string, which every line contains.
    string[] recordMarkers = { String.Empty };

    using (StringReader reader = new StringReader(endText))
    {
        string line;

        // Read until the reader is exhausted instead of iterating a fixed number of times.
        while ((line = reader.ReadLine()) != null)
        {
            string[] parts = line.Split(line_separator, 2);
            string label = parts[0];

            if (parts.Length == 1)
            {
                if (label == "")
                {
                    continue;
                }

                if (recordMarkers.Any(marker => label.Contains(marker)))
                {
                    // Start of the next record: persist the one collected so far.
                    if (UMK_author_line != null && UMK_Tytul != null)
                    {
                        UMK_SaveBufferedArticle();
                    }
                }
                else if (UMK_LabelHas(label, "http://") || UMK_LabelHas(label, "https://"))
                {
                    UMK_Adres_URL = label;
                }

                continue;
            }

            string value = parts[1];

            if (UMK_LabelHas(label, "aut."))
            {
                UMK_autors = value.Split(separators, StringSplitOptions.RemoveEmptyEntries);
                UMK_author_line = value;
            }
            else if (UMK_LabelHas(label, "zapyt"))
            {
                // NOTE(review): this text is concatenated from document content. It is only
                // stored here; it must not be executed without parameterisation.
                UMK_Zapytanie_Wyszukiwania = "SELECT * FROM UMK_Splendor_Expertus_article_database WHERE article LIKE " + value;
            }
            else if (label.Contains("Liczba odnalezionych"))
            {
                UMK_articles_Count = Convert.ToInt32(value);
                recordMarkers = new string[UMK_articles_Count];
                for (int z = 0; z < UMK_articles_Count; z++)
                {
                    recordMarkers[z] = (z + 1) + ".";
                }
            }
            // The specific title labels must be tested before the generic "tytu" check,
            // because every one of them also contains "tytu".
            else if (UMK_LabelHas(label, "tytuł wydawn. zbior.") || UMK_LabelHas(label, "tytul wydawn. zbior."))
            {
                UMK_Tytul_Wydawn_Zbior = value;
            }
            else if (UMK_LabelHas(label, "pełny tytuł czasop.") || UMK_LabelHas(label, "pelny tytul czasop."))
            {
                UMK_Pelny_tytul_czasop = value;
            }
            else if (UMK_LabelHas(label, "tytuł równoległy") || UMK_LabelHas(label, "tytul rownolegly"))
            {
                UMK_Tytul_rownolegly = value;
            }
            else if (UMK_LabelHas(label, "tytu"))
            {
                UMK_Tytul = value;
            }
            else if (UMK_LabelHas(label, "opis wydawn"))
            {
                UMK_Opis_wydawn = value;
            }
            else if (UMK_LabelHas(label, "język") || UMK_LabelHas(label, "jezyk"))
            {
                UMK_Jezyk_Publikacji = value;
            }
            else if (UMK_LabelHas(label, "polskie słowa kluczowe") || UMK_LabelHas(label, "polskie slo"))
            {
                UMK_Slowa_kluczowe_j_pl = value.Split(separators);
                UMK_pl_keywords_line = value;
            }
            else if (UMK_LabelHas(label, "angielskie słowa kluczowe") || UMK_LabelHas(label, "angielskie slowa kluczowe"))
            {
                UMK_Slowa_kluczowe_j_ang = value.Split(separators);
                UMK_en_keywords_line = value;
            }
        }
    }
}

/// <summary>Case-insensitive (ordinal) test whether <paramref name="label"/> contains <paramref name="fragment"/>.</summary>
private static bool UMK_LabelHas(string label, string fragment)
{
    return label.IndexOf(fragment, StringComparison.OrdinalIgnoreCase) >= 0;
}

/// <summary>Appends the prepared terms of <paramref name="text"/> to <paramref name="document"/> unless the text is blank.</summary>
private static void UMK_AppendTerms(StringBuilder document, string text)
{
    if (!string.IsNullOrWhiteSpace(text))
    {
        document.Append(TextPreparing.TermsPrepataions(text));
    }
}

/// <summary>
/// Creates one UMK article entity from the static <c>UMK_*</c> buffers, attaches its
/// authors and vocabulary terms, saves it, and resets the per-record buffers to null.
/// Missing fields are stored as "Not_defined".
/// </summary>
private static void UMK_SaveBufferedArticle()
{
    using (var dbContext = new ArticleDBDataModelContainer())
    {
        var document = new StringBuilder();
        var article = dbContext.UMK_ArticlesSet.Create();

        article.article_author_line = UMK_author_line ?? "Not_defined";
        UMK_author_line = null;

        string title = UMK_Tytul ?? "Not_defined";
        article.article_title = title;
        UMK_AppendTerms(document, title);
        UMK_Tytul = null;

        string fullTitle = UMK_Pelny_tytul_czasop ?? "Not_defined";
        article.article_Full_title = fullTitle;
        UMK_AppendTerms(document, fullTitle);
        UMK_Pelny_tytul_czasop = null;

        article.article_language = UMK_Jezyk_Publikacji ?? "Not_defined";
        UMK_Jezyk_Publikacji = null;

        string parallelTitle = UMK_Tytul_rownolegly ?? "Not_defined";
        article.article_translated_title = parallelTitle;
        UMK_AppendTerms(document, parallelTitle);
        UMK_Tytul_rownolegly = null;

        string englishKeywords = UMK_en_keywords_line ?? "Not_defined";
        article.article_eng_keywords = englishKeywords;
        UMK_AppendTerms(document, englishKeywords);
        UMK_en_keywords_line = null;

        string polishKeywords = UMK_pl_keywords_line ?? "Not_defined";
        article.article_pl_keywords = polishKeywords;
        UMK_AppendTerms(document, polishKeywords);
        UMK_pl_keywords_line = null;

        article.article_url = UMK_Adres_URL ?? "Not_defined";
        UMK_Adres_URL = null;

        article.article_publisher_title = UMK_Tytul_Wydawn_Zbior ?? "Not_defined";
        UMK_Tytul_Wydawn_Zbior = null;

        article.article_publisher_desc = UMK_Opis_wydawn ?? "Not_defined";
        UMK_Opis_wydawn = null;

        // The author tokens are consumed in pairs: first token -> author_name,
        // second token -> author_surename. A trailing unpaired token is ignored.
        for (int k = 0; k <= UMK_autors.Length - 2; k += 2)
        {
            var author = dbContext.AuthorSet.Create();
            author.author_name = UMK_autors[k];
            author.author_surename = UMK_autors[k + 1];
            article.Author.Add(author);
        }

        dbContext.UMK_ArticlesSet.Add(article);

        // NOTE(review): the earlier "allowed dictionary" pass re-read the CSV for every
        // term and called RemoveAt on a temporary ToList() copy, so it never filtered
        // anything. It was dropped; add a real filter here if one is required.
        // TODO: record the id of the document in which each term occurs.
        foreach (string token in document.ToString().Split(' ', ';', ':', ','))
        {
            if (string.IsNullOrWhiteSpace(token))
            {
                continue; // splitting on spaces yields empty tokens; do not store them
            }

            var term = dbContext.Terms_Vocabulary.Create();
            term.term_value = token;
            article.Terms_Vocabulary.Add(term);
        }

        try
        {
            dbContext.SaveChanges();
        }
        catch (Exception ex)
        {
            // Best effort: keep crawling, but append so earlier failures are not overwritten.
            File.AppendAllText(@"F:\\Magistry files\UMK_crawler_Log.txt", ex.ToString() + Environment.NewLine);
        }
    }
}
// TODO: implement divide and conquer for large files.
/// <summary>
/// Parses the WSB listing held in <c>endText</c> line by line and stores every
/// recognised article record in the database.
/// </summary>
/// <remarks>
/// Each line is split into at most two parts with <c>line_separator</c>: a label and a
/// value. Labelled lines fill the static <c>WSB_*</c> buffers. A single-part line that
/// contains one of the record markers ("1.", "2.", ...) flushes the buffered record,
/// provided both an author line and a title have been collected.
/// NOTE(review): a record is flushed only when a later marker line is read, so the last
/// record of the input seems to rely on a trailing marker line - confirm with real data.
/// </remarks>
public static void get_WSB_Document_content()
{
    WSB_articles_Count = 0;

    // Until the "Liczba odnalezionych" line announces the record count, the only marker
    // is the empty string, which every line contains.
    string[] recordMarkers = { String.Empty };

    using (StringReader reader = new StringReader(endText))
    {
        string line;

        while ((line = reader.ReadLine()) != null)
        {
            string[] parts = line.Split(line_separator, 2);
            string label = parts[0];

            if (parts.Length == 1)
            {
                // Start of the next record: persist the one collected so far.
                if (label != ""
                    && recordMarkers.Any(marker => label.Contains(marker))
                    && WSB_author_line != null
                    && WSB_Tytul_pracy != null)
                {
                    WSB_SaveBufferedArticle();
                }

                continue;
            }

            string value = parts[1];

            if (WSB_LabelHas(label, "autor"))
            {
                WSB_autors = value.Split(autor_separators, StringSplitOptions.RemoveEmptyEntries);
                WSB_author_line = value;
            }
            // Must be tested before "tytul pracy": the translated-title label contains it.
            else if (WSB_LabelHas(label, "tytul pracy w innym"))
            {
                WSB_Tytul_pracy_w_innym_j = value;
            }
            else if (WSB_LabelHas(label, "tytul pracy"))
            {
                WSB_Tytul_pracy = value;
            }
            else if (label.Contains("Liczba odnalezionych"))
            {
                WSB_articles_Count = Convert.ToInt32(value);
                recordMarkers = new string[WSB_articles_Count];
                for (int z = 0; z < WSB_articles_Count; z++)
                {
                    recordMarkers[z] = (z + 1) + ".";
                }
            }
            else if (WSB_LabelHas(label, "adres wydawniczy"))
            {
                WSB_Adres_wydawniczy = value;
            }
            else if (WSB_LabelHas(label, "polskie hasla"))
            {
                WSB_Slowa_kluczowe_j_pl = value.Split(separators, StringSplitOptions.RemoveEmptyEntries);
                WSB_Slowa_kluczowe_j_pl_line = value;
            }
            else if (WSB_LabelHas(label, "angielskie hasla"))
            {
                WSB_Slowa_kluczowe_j_ang = value.Split(separators, StringSplitOptions.RemoveEmptyEntries);
                WSB_Slowa_kluczowe_j_ang_line = value;
            }
            else if (WSB_LabelHas(label, "tytul calosci"))
            {
                WSB_Tytul_calosci = value;
            }
            else if (WSB_LabelHas(label, "doi"))
            {
                WSB_DOI = value;
            }
            else if (WSB_LabelHas(label, "szczegoly"))
            {
                WSB_Szczegoly = value;
            }
            else if (WSB_LabelHas(label, "url"))
            {
                WSB_URL = value;
            }
        }
    }
}

/// <summary>Case-insensitive (ordinal) test whether <paramref name="label"/> contains <paramref name="fragment"/>.</summary>
private static bool WSB_LabelHas(string label, string fragment)
{
    return label.IndexOf(fragment, StringComparison.OrdinalIgnoreCase) >= 0;
}

/// <summary>Appends the prepared terms of <paramref name="text"/> to <paramref name="document"/> unless the text is blank.</summary>
private static void WSB_AppendTerms(StringBuilder document, string text)
{
    if (!string.IsNullOrWhiteSpace(text))
    {
        document.Append(TextPreparing.TermsPrepataions(text));
    }
}

/// <summary>
/// Creates one WSB article entity from the static <c>WSB_*</c> buffers, attaches its
/// authors and vocabulary terms, saves it, and resets the per-record buffers to null.
/// Missing fields are stored as "Not_defined".
/// </summary>
private static void WSB_SaveBufferedArticle()
{
    using (var dbContext = new ArticleDBDataModelContainer())
    {
        var document = new StringBuilder();
        var article = dbContext.WSB_ArticlesSet.Create();

        article.article_authors = WSB_author_line ?? "Not_defined";
        WSB_author_line = null;

        string title = WSB_Tytul_pracy ?? "Not_defined";
        article.article_title = title;
        WSB_AppendTerms(document, title);
        WSB_Tytul_pracy = null;

        article.article_publisher_adres = WSB_Adres_wydawniczy ?? "Not_defined";
        WSB_Adres_wydawniczy = null;

        string commonTitle = WSB_Tytul_calosci ?? "Not_defined";
        article.article_common_title = commonTitle;
        WSB_AppendTerms(document, commonTitle);
        WSB_Tytul_calosci = null;

        string polishKeywords = WSB_Slowa_kluczowe_j_pl_line ?? "Not_defined";
        article.article_pl_keywords = polishKeywords;
        WSB_AppendTerms(document, polishKeywords);
        WSB_Slowa_kluczowe_j_pl_line = null;

        string englishKeywords = WSB_Slowa_kluczowe_j_ang_line ?? "Not_defined";
        article.article_eng_keywords = englishKeywords;
        WSB_AppendTerms(document, englishKeywords);
        WSB_Slowa_kluczowe_j_ang_line = null;

        string otherLanguageTitle = WSB_Tytul_pracy_w_innym_j ?? "Not_defined";
        article.article_title_other_lang = otherLanguageTitle;
        WSB_AppendTerms(document, otherLanguageTitle);
        WSB_Tytul_pracy_w_innym_j = null;

        article.article_details = WSB_Szczegoly ?? "Not_defined";
        WSB_Szczegoly = null;

        article.article_URL = WSB_URL ?? "Not_defined";
        WSB_URL = null;

        article.article_DOI = WSB_DOI ?? "Not_defined";
        WSB_DOI = null;

        // The author tokens are consumed in pairs: first token -> author_name,
        // second token -> author_surename. A trailing unpaired token is ignored.
        for (int k = 0; k <= WSB_autors.Length - 2; k += 2)
        {
            var author = dbContext.AuthorSet.Create();
            author.author_name = WSB_autors[k];
            author.author_surename = WSB_autors[k + 1];
            article.Author.Add(author);
        }

        dbContext.WSB_ArticlesSet.Add(article);

        // NOTE(review): the earlier "allowed dictionary" pass re-read the CSV for every
        // term and called RemoveAt on a temporary ToList() copy, so it never filtered
        // anything. It was dropped; add a real filter here if one is required.
        // TODO: record the id of the document in which each term occurs.
        foreach (string token in document.ToString().Split(' ', ';', ':', ','))
        {
            if (string.IsNullOrWhiteSpace(token))
            {
                continue; // splitting on spaces yields empty tokens; do not store them
            }

            var term = dbContext.Terms_Vocabulary.Create();
            term.term_value = token;
            article.Terms_Vocabulary.Add(term);
        }

        try
        {
            dbContext.SaveChanges();
        }
        catch (Exception ex)
        {
            // Best effort: keep crawling, but append so earlier failures are not overwritten.
            File.AppendAllText(@"F:\\Magistry files\WSB_crawler_Log.txt", ex.ToString() + Environment.NewLine);
        }
    }
}
/// <summary>
/// Exports every row of the PG, PP, UG, UMK and WSB article tables to a CSV file and to a
/// JavaScript file that declares an <c>articles</c> array.
/// </summary>
/// <param name="articlesCSV">Path of the CSV output file. Content is appended to it.</param>
/// <param name="articlesJson">Path of the JavaScript output file. Content is appended to it.</param>
/// <remarks>
/// Every table is mapped onto the same seven columns (id, title, abstract, keywords, year,
/// authors, url); columns a table does not provide are left blank. The CSV starts with
/// <c>articleHeaderCSV</c>.
/// Compared with the previous implementation:
/// embedded double quotes in CSV fields are doubled so a quote inside a title or abstract no
/// longer breaks the row; the serialized objects are separated by commas so the emitted
/// <c>var articles = [...]</c> statement is valid JavaScript; and the output is accumulated
/// in a <see cref="StringBuilder"/> / list instead of by repeated string concatenation.
/// </remarks>
public static void GenerateArticlesToCSVandJsonFromDB(string articlesCSV, string articlesJson)
{
    var csvContent = new StringBuilder();
    var jsonRows = new List<string>();

    csvContent.Append(articleHeaderCSV);

    // Quotes one CSV field; a null value becomes an empty field, exactly as string
    // concatenation treated it before.
    Func<object, string> quoteCsvField =
        value => "\"" + Convert.ToString(value).Replace("\"", "\"\"") + "\"";

    // Adds one article to both outputs.
    Action<ArticlesJsonObj, object[]> addArticle = (jsonObj, csvFields) =>
    {
        csvContent.Append(string.Join(",", csvFields.Select(quoteCsvField))).Append('\n');
        jsonRows.Add(JsonConvert.SerializeObject(jsonObj));
    };

    using (var PG_dbcontext = new ArticleDBDataModelContainer())
    {
        foreach (var item in PG_dbcontext.PG_ArticlesSet.SqlQuery("SELECT * FROM dbo.PG_ArticlesSet").ToList())
        {
            addArticle(
                new ArticlesJsonObj(item.article_Id, item.title, item.abstractText, item.keywords, item.year.ToString(), item.authors, item.url),
                new object[] { item.article_Id, item.title, item.abstractText, item.keywords, item.year, item.authors, item.url });
        }
    }

    using (var PP_dbcontext = new ArticleDBDataModelContainer())
    {
        foreach (var item in PP_dbcontext.PP_ArticlesSet.SqlQuery("SELECT * FROM dbo.PP_ArticlesSet").ToList())
        {
            addArticle(
                new ArticlesJsonObj(item.article_Id, item.article_title, string.Empty, string.Empty, item.article_year.ToString(), item.article_author_line, item.article_DOI),
                new object[] { item.article_Id, item.article_title, "", "", item.article_year, item.article_author_line, item.article_DOI });
        }
    }

    using (var UG_dbcontext = new ArticleDBDataModelContainer())
    {
        foreach (var item in UG_dbcontext.UG_ArticlesSet.SqlQuery("SELECT * FROM dbo.UG_ArticlesSet").ToList())
        {
            addArticle(
                new ArticlesJsonObj(item.article_Id, item.article_title, string.Empty, item.article_keywords, string.Empty, item.article_author_line, item.article_DOI),
                new object[] { item.article_Id, item.article_title, "", item.article_keywords, "", item.article_author_line, item.article_DOI });
        }
    }

    using (var UMK_dbcontext = new ArticleDBDataModelContainer())
    {
        foreach (var item in UMK_dbcontext.UMK_ArticlesSet.SqlQuery("SELECT * FROM dbo.UMK_ArticlesSet").ToList())
        {
            string title = item.article_title + " " + item.article_Full_title + " " + item.article_translated_title;
            string keywords = item.article_pl_keywords + " " + item.article_eng_keywords;
            addArticle(
                new ArticlesJsonObj(item.article_Id, title, string.Empty, keywords, string.Empty, item.article_author_line, item.article_url),
                new object[] { item.article_Id, title, "", keywords, "", item.article_author_line, item.article_url });
        }
    }

    using (var WSB_dbcontext = new ArticleDBDataModelContainer())
    {
        foreach (var item in WSB_dbcontext.WSB_ArticlesSet.SqlQuery("SELECT * FROM dbo.WSB_ArticlesSet").ToList())
        {
            string title = item.article_title + " " + item.article_common_title + " " + item.article_title_other_lang;
            string keywords = item.article_pl_keywords + " " + item.article_eng_keywords;
            // The WSB rows have always written a single space (not an empty field) for the
            // missing abstract and year columns; that output is kept unchanged.
            addArticle(
                new ArticlesJsonObj(item.article_Id, title, string.Empty, keywords, string.Empty, item.article_authors, item.article_URL),
                new object[] { item.article_Id, title, " ", keywords, " ", item.article_authors, item.article_URL });
        }
    }

    // One object per line, comma separated; an empty export yields "var articles = []".
    string jsonContent = "var articles = ["
        + string.Join(",\n", jsonRows)
        + (jsonRows.Count > 0 ? "\n" : string.Empty)
        + "]";

    using (StreamWriter csv_SW = File.AppendText(articlesCSV))
    {
        csv_SW.Write(csvContent.ToString());
    }

    using (StreamWriter json_SW = File.AppendText(articlesJson))
    {
        json_SW.Write(jsonContent);
    }
}
/// <summary>
/// Returns the <c>Terms_Vocabulary</c> entity set exposed by the given database context.
/// </summary>
/// <param name="dbcon">Database context whose vocabulary set is requested.</param>
/// <returns>The context's <c>Terms_Vocabulary</c> set.</returns>
private static DbSet<Terms_Vocabulary> GetTerms_Vocabulary(ArticleDBDataModelContainer dbcon)
{
    DbSet<Terms_Vocabulary> vocabulary = dbcon.Terms_Vocabulary;
    return vocabulary;
}
/// <summary>
/// Parses the UG record listing held in <c>endText</c> line by line and stores every
/// completed record as a UG article, together with its authors and vocabulary terms.
/// </summary>
/// <remarks>
/// Each line is split once on <c>line_separator</c> into a label and a value. Labelled lines
/// fill the static <c>UG_*</c> fields. An unlabelled, non-empty line that contains one of the
/// record markers ("1.", "2.", ...) flushes the fields collected so far to the database,
/// provided both an author line and a title have been seen.
/// Compared with the previous implementation:
/// empty keyword/title text is no longer sent through <c>TermsPrepataions</c> (whose "NULL"
/// sentinel used to be stored as a term); keyword terms and title terms are separated by a
/// space instead of being fused; empty terms are skipped; the allowed-term dictionary file is
/// no longer re-read for every term (the loop it fed only modified a temporary copy and had no
/// effect); two unused arrays sized by the HTML text length are gone; log entries are appended
/// instead of overwriting the previous entry; the DOI test inspects the label only.
/// </remarks>
public static void get_UG_Document_content()
{
    UG_articles_Count = 0;

    // Until the record count is known the only marker is the empty string, which every
    // line contains, so any unlabelled non-empty line is treated as a record boundary.
    string[] UG_articles_Matrix = { String.Empty };

    using (StringReader sr = new StringReader(endText))
    {
        string UG_line;
        while ((UG_line = sr.ReadLine()) != null)
        {
            string[] UG_separatedContent = UG_line.Split(line_separator, 2);

            if (UG_separatedContent.Length == 1)
            {
                string unlabelled = UG_separatedContent[0];
                if (unlabelled == "")
                {
                    continue;
                }

                if (UG_articles_Matrix.Any(marker => unlabelled.Contains(marker)))
                {
                    if (UG_author_line != null && UG_Tytul != null)
                    {
                        SaveCollectedUgArticle();
                    }
                    else
                    {
                        AppendUgCrawlerLog("Empty line detected.");
                    }
                }

                continue;
            }

            if (UG_separatedContent.Length != 2)
            {
                continue;
            }

            string label = UG_separatedContent[0];
            string value = UG_separatedContent[1];

            if (label.Contains("Liczba odnalezionych"))
            {
                // Header line with the number of records: build the markers "1.", "2.", ...
                UG_articles_Count = Convert.ToInt32(value);
                UG_articles_Matrix = new string[UG_articles_Count];
                for (int z = 0; z < UG_articles_Count; z++)
                {
                    UG_articles_Matrix[z] = (z + 1) + ".";
                }
            }
            else if (label.IndexOf("autorzy", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                UG_author_line = value;
                UG_autors = value.Split(autor_separators, StringSplitOptions.RemoveEmptyEntries);
            }
            else if (label.IndexOf("tytu", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                // Covers "Tytul", "TYTUL" and "TYTUL[ROZDZIALU, FRAGMENTU]".
                UG_Tytul = value;
            }
            else if (label.IndexOf("zrodlo", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                UG_Zrodlo = value;
            }
            else if (label.Contains("Slowa kluczowe w j. ang."))
            {
                UG_Slowa_kluczowe_j_ang = value.Split(separators);
                UG_slowa_kluczowe_j_ang_line = value;
            }
            else if (label.IndexOf("doi", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                UG_DOI = value;
            }
        }
    }
}

/// <summary>
/// Stores the record currently held in the static <c>UG_*</c> fields as a UG article with its
/// authors and terms, then clears the per-record fields. Save failures are logged, not thrown.
/// </summary>
private static void SaveCollectedUgArticle()
{
    using (var dbContext = new ArticleDBDataModelContainer())
    {
        var ug_article = dbContext.UG_ArticlesSet.Create();
        ug_article.article_author_line = UG_author_line;
        ug_article.article_keywords = UG_slowa_kluczowe_j_ang_line;
        ug_article.article_source = UG_Zrodlo;
        ug_article.article_title = UG_Tytul;
        ug_article.article_DOI = UG_DOI;

        // Keywords first, then title; blank text is skipped.
        var preparedTexts = new List<string>();
        foreach (string rawText in new[] { UG_slowa_kluczowe_j_ang_line, UG_Tytul })
        {
            if (!string.IsNullOrWhiteSpace(rawText))
            {
                preparedTexts.Add(TextPreparing.TermsPrepataions(rawText));
            }
        }

        // The record has been copied; reset the fields for the next one.
        UG_author_line = null;
        UG_slowa_kluczowe_j_ang_line = null;
        UG_Zrodlo = null;
        UG_Tytul = null;
        UG_DOI = null;

        // UG_autors is consumed in pairs: element k is stored as the name, k + 1 as the surname.
        for (int k = 0; k <= UG_autors.Length - 2; k += 2)
        {
            var authors_of_the_article = dbContext.AuthorSet.Create();
            authors_of_the_article.author_name = UG_autors[k];
            authors_of_the_article.author_surename = UG_autors[k + 1];
            ug_article.Author.Add(authors_of_the_article);
        }

        dbContext.UG_ArticlesSet.Add(ug_article);

        string[] termValues = string.Join(" ", preparedTexts)
            .Split(new[] { ' ', ';', ':', ',' }, StringSplitOptions.RemoveEmptyEntries);

        foreach (string termValue in termValues)
        {
            var terms = dbContext.Terms_Vocabulary.Create();
            terms.term_value = termValue;
            try
            {
                ug_article.Terms_Vocabulary.Add(terms);
            }
            catch (Exception addingTermToDB)
            {
                AppendUgCrawlerLog(addingTermToDB.ToString());
            }
        }

        try
        {
            dbContext.SaveChanges();
        }
        catch (Exception ex)
        {
            AppendUgCrawlerLog(ex.ToString());
        }
    }
}

/// <summary>Appends one time-stamped entry to the UG crawler log file.</summary>
private static void AppendUgCrawlerLog(string message)
{
    File.AppendAllText(
        @"F:\\Magistry files\UG_crawler_Log.txt",
        DateTime.Now.ToString() + " " + message + Environment.NewLine);
}
/// <summary>
/// Builds the document collection: one lower-cased text per article row of the PG, PP, UG,
/// UMK and WSB tables, in that order.
/// </summary>
/// <returns>
/// A list with one entry per database row; each entry is the concatenation (without any
/// separator) of the row's text columns, lower-cased.
/// </returns>
/// <remarks>
/// The previous guards of the form <c>x != null || x != String.Empty || ...</c> were always
/// true, so a row with a NULL column crashed with a <see cref="NullReferenceException"/> at
/// <c>.ToLower()</c>. NULL columns are now treated as empty text; rows that worked before
/// produce exactly the same entry, and every row still yields one entry so positions in the
/// returned list are unchanged.
/// NOTE(review): the columns are joined without a space, so the last word of one column fuses
/// with the first word of the next unless the stored text is padded — confirm whether a
/// separator is wanted.
/// </remarks>
public static List<string> GenerateCollection()
{
    List<string> DocumentCollection = new List<string>();

    // Null-tolerant lower-casing.
    Func<string, string> lower = text => text == null ? String.Empty : text.ToLower();

    using (var dbContext = new ArticleDBDataModelContainer())
    {
        foreach (var item in dbContext.PG_ArticlesSet.SqlQuery("SELECT * FROM dbo.PG_ArticlesSet").ToList())
        {
            DocumentCollection.Add(lower(item.title) + lower(item.abstractText) + lower(item.keywords));
        }

        foreach (var PP_item in dbContext.PP_ArticlesSet.SqlQuery("SELECT * FROM dbo.PP_ArticlesSet").ToList())
        {
            DocumentCollection.Add(lower(PP_item.article_title) + lower(PP_item.article_source));
        }

        foreach (var UG_item in dbContext.UG_ArticlesSet.SqlQuery("SELECT * FROM UG_ArticlesSet").ToList())
        {
            DocumentCollection.Add(lower(UG_item.article_title) + lower(UG_item.article_keywords));
        }

        foreach (var UMK_item in dbContext.UMK_ArticlesSet.SqlQuery("SELECT * FROM UMK_ArticlesSet").ToList())
        {
            DocumentCollection.Add(
                lower(UMK_item.article_title)
                + lower(UMK_item.article_Full_title)
                + lower(UMK_item.article_translated_title)
                + lower(UMK_item.article_publisher_title)
                + lower(UMK_item.article_eng_keywords)
                + lower(UMK_item.article_pl_keywords));
        }

        foreach (var WSB_item in dbContext.WSB_ArticlesSet.SqlQuery("SELECT * FROM WSB_ArticlesSet").ToList())
        {
            DocumentCollection.Add(
                lower(WSB_item.article_title)
                + lower(WSB_item.article_common_title)
                + lower(WSB_item.article_title_other_lang)
                + lower(WSB_item.article_pl_keywords)
                + lower(WSB_item.article_eng_keywords));
        }
    }

    return DocumentCollection;
}
/// <summary>
/// Builds the vector space model: for every document a TF-IDF weight is computed for each
/// term of the vocabulary loaded from the <c>Terms_Vocabulary</c> table.
/// </summary>
/// <param name="docCollectionDictionary">Documents keyed by article id; the value is the document text.</param>
/// <returns>
/// One <c>DocumentVector</c> per input document, in the enumeration order of
/// <paramref name="docCollectionDictionary"/>. <c>index_Of_Doc_for_labeling</c> is the
/// document's position in that order.
/// </returns>
/// <remarks>
/// The documents are processed in parallel. The previous implementation shared the weight
/// array, the index and the vector object between all worker threads and added to a
/// <see cref="List{T}"/> concurrently, which could mix weights of different documents or
/// corrupt the list. Every worker now uses its own locals and writes only to its own slot of
/// a pre-sized array, which also makes the order of the result deterministic.
/// Side effects: replaces the static <c>termHashset</c>, sets
/// <c>parallelOption.MaxDegreeOfParallelism</c> to 20 and appends the elapsed time to the
/// processing log file.
/// </remarks>
internal static List<DocumentVector> DocumentCollectionProcessingDictionary(Dictionary<int, string> docCollectionDictionary)
{
    parallelOption.MaxDegreeOfParallelism = 20;
    var vector_space_model_calculation = Stopwatch.StartNew();

    termHashset = new HashSet<string>();
    using (var dbContext = new ArticleDBDataModelContainer())
    {
        dbContext.Terms_Vocabulary.Load();
        foreach (var terms in dbContext.Terms_Vocabulary.Local)
        {
            // Rows without a term value cannot be lower-cased; skip them instead of crashing.
            if (terms.term_value != null)
            {
                termHashset.Add(terms.term_value.ToLower());
            }
        }
    }

    // Fixed snapshots so every worker sees the same term order and document positions.
    string[] vocabulary = termHashset.ToArray();
    KeyValuePair<int, string>[] documents = docCollectionDictionary.ToArray();
    DocumentVector[] vectors = new DocumentVector[documents.Length];

    Parallel.For(0, documents.Length, parallelOption, position =>
    {
        KeyValuePair<int, string> document = documents[position];

        // Private copy per document, as before, so the list instance handed to FindTFIDF
        // is never shared between worker threads.
        List<string> collectionValue = docCollectionDictionary.Values.ToList();

        float[] space = new float[vocabulary.Length];
        for (int termIndex = 0; termIndex < vocabulary.Length; termIndex++)
        {
            space[termIndex] = Logic.ClusteringAlgorithms.Used_functions.TFIDF2ndrealization.FindTFIDF(collectionValue, document.Value, vocabulary[termIndex]);
        }

        DocumentVector documentVector = new DocumentVector();
        documentVector.ArticleID = document.Key;
        // Keys and key/value pairs of a dictionary enumerate in the same order, so the
        // position equals the index the old linear search over the key array produced.
        documentVector.index_Of_Doc_for_labeling = position;
        documentVector.Content = document.Value;
        documentVector.VectorSpace = space;

        vectors[position] = documentVector;
    });

    vector_space_model_calculation.Stop();

    string processing_log = @"F:\Magistry files\Processing_log.txt";
    using (StreamWriter sw = File.AppendText(processing_log))
    {
        sw.WriteLine(DateTime.Now.ToString() + " The vector space model calculation time is: " + vector_space_model_calculation.Elapsed.Minutes.ToString() + ":" + vector_space_model_calculation.Elapsed.TotalMilliseconds.ToString());
    }

    return new List<DocumentVector>(vectors);
}
/// <summary>
/// Loads the term vocabulary from the database and returns a cleaned, de-duplicated list of
/// lower-cased terms.
/// </summary>
/// <returns>
/// The terms that survive these filters, in database order:
/// terms of three characters or fewer are kept only when listed in the allowed-term
/// dictionary file; terms containing '(', ')', '[', ']' or '*' are dropped; duplicates are
/// dropped; a term that occurs inside another, longer term is dropped.
/// </returns>
/// <remarks>
/// The previous implementation could not complete for a normal vocabulary: an inner loop
/// tested <c>i</c> instead of <c>k</c> and ran past the end of the term
/// (<see cref="ArgumentOutOfRangeException"/>), the list was modified with
/// <c>RemoveAt(i)</c> inside loops that kept using <c>i</c>/<c>j</c>, and the final pass
/// compared every term with itself and therefore removed it. The always-true null guard also
/// let NULL term values reach <c>.ToLower()</c>.
/// NOTE(review): the old "cleaning" loop called <c>string.Remove</c> and discarded the result,
/// so it never changed any term. Character stripping is intentionally NOT enabled here,
/// because the old character list contained the letter 'x' and would mangle ordinary words —
/// confirm the intended list before adding it.
/// NOTE(review): the short-term whitelist is applied as a whole-term, case-insensitive match
/// (the old code used a substring test against every dictionary entry, which rejected
/// practically every short term) — confirm.
/// </remarks>
public static List<string> GenerateTermCollection()
{
    List<string> TermCollection = new List<string>();

    using (var dbContext = new ArticleDBDataModelContainer())
    {
        foreach (var item in dbContext.Terms_Vocabulary.SqlQuery("SELECT * FROM dbo.Terms_Vocabulary").ToList())
        {
            if (!String.IsNullOrEmpty(item.term_value))
            {
                TermCollection.Add(item.term_value.ToLower());
            }
        }
    }

    string dictionary_text = File.ReadAllText(@"F:\Magistry files\csv_files\Allowed_term_dictionary.csv");

    // Trim entries (the split leaves '\r' and padding behind) and ignore blank ones.
    var allowedShortTerms = new HashSet<string>(
        dictionary_text.Split(',', '\n')
                       .Select(entry => entry.Trim())
                       .Where(entry => entry.Length > 0),
        StringComparer.OrdinalIgnoreCase);

    char[] rejectedChars = { '(', ')', '[', ']', '*' };

    TermCollection.RemoveAll(term =>
        (term.Length <= 3 && !allowedShortTerms.Contains(term))
        || term.IndexOfAny(rejectedChars) >= 0);

    // First occurrence of each term wins.
    List<string> distinctTerms = TermCollection.Distinct().ToList();

    // Drop terms that are contained in another (necessarily longer) term.
    return distinctTerms
        .Where(term => !distinctTerms.Any(other => other.Length > term.Length && other.Contains(term)))
        .ToList();
}
/// <summary>
/// Turns a piece of text (for example a title or a keyword line) into a space-separated
/// string of candidate vocabulary terms.
/// </summary>
/// <param name="_text">The raw text.</param>
/// <returns>
/// The literal string <c>"NULL"</c> when <paramref name="_text"/> is null or empty. Otherwise
/// the distinct lower-cased words of the text, in order of first appearance, that are at
/// least three characters long, contain no digit or listed special character, are not yet
/// stored in the <c>Terms_Vocabulary</c> table and are not on the stop-word list.
/// </returns>
/// <remarks>
/// Compared with the previous implementation:
/// the database context is disposed; the null/empty check happens before the database is
/// queried; the length / character / already-known filter now runs once and no longer
/// depends on the vocabulary table containing at least one row (with an empty table nothing
/// was filtered at all); the unused stemmer, regular expression and the read of the
/// allowed-term dictionary file were removed.
/// NOTE(review): the removed dictionary loop called <c>RemoveAt</c> on a temporary copy and
/// never changed the result. If three-letter words are meant to be kept only when listed in
/// the dictionary, that filter still has to be written — confirm the intent.
/// Side effects: appends the processing time to the processing log file and writes it to the
/// debug output.
/// </remarks>
public static string TermsPrepataions(string _text)
{
    if (string.IsNullOrEmpty(_text))
    {
        return "NULL";
    }

    // Terms already stored in the vocabulary; those are not returned again.
    var knownTerms = new HashSet<string>();
    using (var dbContainer = new ArticleDBDataModelContainer())
    {
        foreach (var element in dbContainer.Set<Terms_Vocabulary>())
        {
            knownTerms.Add(element.term_value);
        }
    }

    var text_preparation = Stopwatch.StartNew();

    char[] splitChars = { ' ', ',', '.', ';', '-', ':' };
    string[] removableWords = { "and", "or", "it", "at", "all", "in", "on", "under", "between", "a", "an", "the", "to", "pod", "nad", "tam", "tutaj", "między", "pomiędzy", "w", "przed", "się", "z", "na", "od", "jest", "iż", "co", "we", "ich", "ciebie", "ja", "ty", "ona", "ono", "oni", "owych", "of", "cz", "do", "s", "n", "r", "nr", "rys", "i", "by", "from", "o", "//", "**", "po", "jej", "przy", "rzecz", "jak", "tymi", "są", "czy", "oraz", "ze", "m", "p", "off", "for", "/", "is", "as", "be", "will", "go", "za", "też", "lub", "t", "poz", "wiad", "set", "use", "etc", "also", "are", "tzw", "out", "other", "its", "has", "<", ">", "pre", "its", "has", "are", "with", "[et", "]", "vol", "leszek", "j", "al", "tych", "tym" };
    char[] not_allowed_chars = { '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '!', '@', '#', '$', '%', '^', '&', '\'', '"', '[', ']', '{', '}', '(', ')' };

    List<string> Words = _text.ToLower()
        .Split(splitChars, StringSplitOptions.RemoveEmptyEntries)
        .ToList();

    Words.RemoveAll(word =>
        word.Length < 3
        || word.IndexOfAny(not_allowed_chars) >= 0
        || knownTerms.Contains(word));

    // Except() yields each remaining word once, in order of first appearance.
    var stemmingString = string.Join(" ", Words.Except(removableWords));

    text_preparation.Stop();

    string timingMessage = "The text processing time is: " + text_preparation.Elapsed.Minutes.ToString() + ":" + text_preparation.Elapsed.TotalMilliseconds.ToString();

    string processing_log = @"F:\Magistry files\Processing_log.txt";
    using (StreamWriter sw = File.AppendText(processing_log))
    {
        sw.WriteLine(DateTime.Now.ToString() + timingMessage);
    }

    Debug.WriteLine(timingMessage);

    return stemmingString;
}