/// <summary>
/// Parses the HTML of one forum post and inserts it into, or refreshes it in, the Posts table.
/// </summary>
/// <remarks>
/// The HTML is consumed front to back: every field is located relative to the end of the
/// previous one, so the extraction order below must not be changed. A required marker that
/// is missing surfaces as an <see cref="ArgumentOutOfRangeException"/> thrown by Substring.
/// </remarks>
/// <param name="stock">Value stored in the Stock column.</param>
/// <param name="subject">Thread subject. When its first four characters contain "re:" (any case) the page number is read from <paramref name="threadLink"/>.</param>
/// <param name="threadCodeOrg">HTML of the post; must contain the avatar element and everything after it.</param>
/// <param name="threadLink">Thread URL; its "page-N" segment supplies the page number for replies.</param>
/// <param name="authorPostLink">Forwarded unchanged to saveAuthorData.</param>
/// <returns>1 when a new row was added, 0 when a row with the same subject, page and post id already existed.</returns>
public static int savePageData(string stock, string subject, string threadCodeOrg, string threadLink, string authorPostLink)
{
    // NOTE(review): HCDB looks like an Entity Framework context; if it is IDisposable it
    // should be wrapped in a using block - confirm against its declaration.
    var db = new HCDB();
    string cursor = threadCodeOrg;

    // Page number. Subjects shorter than four characters are inspected as a whole instead
    // of making Substring(0, 4) throw.
    int pageNum = 1;
    string subjectHead = subject.Length > 4 ? subject.Substring(0, 4) : subject;
    if (subjectHead.IndexOf("re:", StringComparison.OrdinalIgnoreCase) != -1)
    {
        int pageMarker = threadLink.IndexOf("page-", StringComparison.Ordinal);
        if (pageMarker != -1)
        {
            // A link without a query string ends right after the page number.
            string pageText = threadLink.Substring(pageMarker + 5);
            int queryStart = pageText.IndexOf("?", StringComparison.Ordinal);
            if (queryStart != -1)
            {
                pageText = pageText.Substring(0, queryStart);
            }
            pageNum = Convert.ToInt32(pageText);
        }
    }

    // Author link: first quoted href after the avatar element.
    cursor = pageDataFrom(cursor, "class=\"avatar\"");
    cursor = pageDataFrom(cursor, "a href");
    cursor = pageDataAfter(cursor, "\"");
    string authorPath = pageDataUntil(cursor, "\"");
    // Test the extracted path: the absolute link can never be blank because of its host prefix.
    if (authorPath.Trim() != "")
    {
        saveAuthorData("http://hotcopper.com.au/" + authorPath, authorPostLink);
    }

    // Author: text of the next heading element.
    cursor = pageDataFrom(cursor, "<h");
    cursor = pageDataAfter(cursor, ">");
    string postAuthor = pageDataUntil(cursor, "</");
    postAuthor = pageDataStripFirstTag(postAuthor, "<a");
    postAuthor = postAuthor.Replace("</a>", "").Trim();
    postAuthor = pageDataDecodeApostrophes(postAuthor);

    // Date, Time
    string postDate = pageDataReadDefinition(ref cursor, "Date:");
    string postTime = pageDataReadDefinition(ref cursor, "Time:");
    DateTime postDateTime = Convert.ToDateTime(postDate + " " + postTime);

    // Post ID
    string postIDstr = pageDataReadDefinition(ref cursor, "Post #:");
    postIDstr = pageDataStripFirstTag(postIDstr, "<a");
    postIDstr = postIDstr.Replace("</a>", "").Trim();
    int postID = Convert.ToInt32(postIDstr);

    // IP (optional)
    string postIP = "Not Found";
    if (cursor.IndexOf("IP:", StringComparison.Ordinal) != -1)
    {
        postIP = pageDataReadDefinition(ref cursor, "IP:");
    }

    // Content: first blockquote inside the article of the content section.
    cursor = pageDataFrom(cursor, "class=\"content\"");
    cursor = pageDataAfter(cursor, "<article>");
    cursor = pageDataFrom(cursor, "<blockquote");
    cursor = pageDataAfter(cursor, ">");
    string story = pageDataCleanStory(pageDataUntil(cursor, "</blockquote"));

    // Length of the post: runs of non-whitespace (word count, compared with MS Word).
    int postWordCount = Regex.Matches(story, @"[\S]+").Count;

    // Likes: the text that follows the like icon's closing span.
    cursor = pageDataFrom(cursor, "icon icon-like");
    cursor = pageDataAfter(cursor, "</span>");
    string likesStr = pageDataUntil(cursor, "</").Replace(",", "");
    int likes;
    if (!Int32.TryParse(likesStr, out likes))
    {
        likes = 0;
    }

    // Price at posting (optional). A value containing "(" is left at 0; a value that
    // fails to parse is stored as -1.
    Decimal priceAtPosting = 0;
    if (cursor.IndexOf("Price at posting", StringComparison.Ordinal) != -1)
    {
        cursor = pageDataFrom(cursor, "Price at posting");
        cursor = pageDataAfter(cursor, "$");
        string priceText = pageDataUntil(cursor, "</");
        if (!priceText.Contains("(") && !Decimal.TryParse(priceText, out priceAtPosting))
        {
            priceAtPosting = -1M;
        }
    }

    // Sentiment and Disclosure (both optional, read in this order).
    string sentiment = pageDataReadOptionalField(ref cursor, "Sentiment");
    string disclosure = pageDataReadOptionalField(ref cursor, "Disclosure");

    var existing = db.Posts.FirstOrDefault(f => f.Subject == subject && f.PageNum == pageNum && f.Post_ID == postID);
    if (existing == null)
    {
        db.Posts.Add(new HCDB_Posts
        {
            Stock = stock,
            Subject = subject,
            PageNum = pageNum,
            Content = story,
            Likes = likes,
            DateTime = postDateTime,
            Author = postAuthor,
            Post_ID = postID,
            IP = postIP,
            Length_of_Post = postWordCount,
            Price_at_Posting = priceAtPosting,
            Disclosure = disclosure,
            Sentiment = sentiment
        });
        db.SaveChanges();
        return 1;
    }

    // Known post: only the values that can change between crawls are refreshed.
    if (existing.Likes != likes)
    {
        existing.Likes = likes;
    }
    if (existing.Price_at_Posting != priceAtPosting)
    {
        existing.Price_at_Posting = priceAtPosting;
    }
    db.SaveChanges();
    return 0;
}

// Returns text starting at the first occurrence of marker; Substring throws
// ArgumentOutOfRangeException when the marker is absent.
private static string pageDataFrom(string text, string marker)
{
    return text.Substring(text.IndexOf(marker, StringComparison.Ordinal));
}

// Returns the text that follows the first occurrence of marker. When the marker is absent
// IndexOf's -1 is still added to the marker length, as in the code this replaces, so a
// one-character marker leaves the text unchanged.
private static string pageDataAfter(string text, string marker)
{
    return text.Substring(text.IndexOf(marker, StringComparison.Ordinal) + marker.Length);
}

// Returns the text that precedes the first occurrence of marker; Substring throws
// ArgumentOutOfRangeException when the marker is absent.
private static string pageDataUntil(string text, string marker)
{
    return text.Substring(0, text.IndexOf(marker, StringComparison.Ordinal));
}

// Moves cursor to label, then into the following <dd ...> element, and returns that
// element's text up to the next "</".
private static string pageDataReadDefinition(ref string cursor, string label)
{
    cursor = pageDataFrom(cursor, label);
    cursor = pageDataFrom(cursor, "<dd");
    cursor = pageDataAfter(cursor, ">");
    return pageDataUntil(cursor, "</");
}

// Reads the attribute-less <dd> value that follows label, or returns "Not Found" (leaving
// cursor untouched) when label does not occur in the remaining HTML.
private static string pageDataReadOptionalField(ref string cursor, string label)
{
    if (cursor.IndexOf(label, StringComparison.Ordinal) == -1)
    {
        return "Not Found";
    }
    cursor = pageDataFrom(cursor, label);
    cursor = pageDataAfter(cursor, "<dd>");
    return pageDataUntil(cursor, "</");
}

// Replaces the first tag that begins with openToken (up to and including its ">") with a
// newline; returns html unchanged when openToken does not occur.
private static string pageDataStripFirstTag(string html, string openToken)
{
    int tagStart = html.IndexOf(openToken, StringComparison.Ordinal);
    if (tagStart == -1)
    {
        return html;
    }
    string rest = html.Substring(tagStart + openToken.Length);
    int afterTag = rest.IndexOf(">", StringComparison.Ordinal) + 1;
    return html.Substring(0, tagStart) + "\n" + rest.Substring(afterTag);
}

// Replaces every openToken ... closeToken section with a newline. A section that is never
// closed swallows the rest of the text.
private static string pageDataRemoveSections(string html, string openToken, string closeToken)
{
    int start = html.IndexOf(openToken, StringComparison.Ordinal);
    while (start != -1)
    {
        // The search starts on the last character of the opener so that the degenerate
        // comment "<!--->" still counts as closed, as it did before.
        int close = html.IndexOf(closeToken, start + openToken.Length - 1, StringComparison.Ordinal);
        string after = close == -1 ? "" : html.Substring(close + closeToken.Length);
        html = html.Substring(0, start) + "\n" + after;
        start = html.IndexOf(openToken, StringComparison.Ordinal);
    }
    return html;
}

// Decodes the numeric HTML entity for an apostrophe.
// NOTE(review): the literal was mangled into Replace("'", "'") in the previous revision;
// "&#039;" / "&#39;" is the presumed original - confirm against a live page.
private static string pageDataDecodeApostrophes(string text)
{
    return text.Replace("&#039;", "'").Replace("&#39;", "'");
}

// Reduces the HTML of a post body to a single line of plain text.
private static string pageDataCleanStory(string story)
{
    // Opening tags (with their attributes) are removed first, in this order.
    string[] openingTags = { "<a", "<p", "<img", "<span", "<script", "<ins", "<div", "<link", "<video", "<source" };
    foreach (string tag in openingTags)
    {
        while (story.Contains(tag))
        {
            story = pageDataStripFirstTag(story, tag);
        }
    }

    story = pageDataRemoveSections(story, "<style>", "</style>");
    story = pageDataRemoveSections(story, "<!--", "-->");

    string[] leftoverTags =
    {
        "</style>", "</span>", "</script>", "</video>", "</a>", "<br />", "</ul>", "</li>",
        "</tr>", "</td>", "<p>", "</p>", "<P>", "</P>", "</ins>", "</div>", "<em>", "</em>"
    };
    foreach (string tag in leftoverTags)
    {
        story = story.Replace(tag, "");
    }

    // Entities and typographic punctuation. The entity spellings restore literals that had
    // been mangled into no-ops (and one invalid literal); the curly quotes are written as
    // escapes so the file's encoding cannot damage them again.
    story = story.Replace("&nbsp;", " ").Replace("\u00A0", " ");
    story = story.Replace("&lsquo;", "'").Replace("&rsquo;", "'");
    story = story.Replace("&ldquo;", "\"").Replace("&rdquo;", "\"");
    story = story.Replace("&quot;", "\"");
    story = story.Replace("&amp;", "&");
    story = story.Replace("\u2018", "'").Replace("\u2019", "'");
    story = story.Replace("\u201C", "\"").Replace("\u201D", "\"");
    story = story.Replace(" \u2014 ", " - ");
    story = story.Replace("\n", "").Replace("\t", "");
    story = pageDataDecodeApostrophes(story);

    foreach (string tag in new[] { "<b>", "</b>", "<br>", "<strong>", "</strong>" })
    {
        story = story.Replace(tag, "");
    }
    return story.Trim();
}
/// <summary>
/// Downloads an author's profile page, extracts the profile statistics and lists, and
/// inserts or refreshes the matching row in the Authors table.
/// </summary>
/// <remarks>
/// The page is consumed front to back, so the fields must be read in the order below.
/// Every failure (invalid page, missing marker, database error) is written to the console
/// and swallowed; nothing is thrown to the caller.
/// </remarks>
/// <param name="authorLink">URL of the author's profile page, passed to getSourceCode.</param>
/// <param name="authorPostLink">Passed to numOfPostsInAMonth to obtain the monthly post count.</param>
public static void saveAuthorData(string authorLink, string authorPostLink)
{
    // NOTE(review): HCDB looks like an Entity Framework context; if it is IDisposable it
    // should be wrapped in a using block - confirm against its declaration.
    var db = new HCDB();
    try
    {
        string sourceCode = getSourceCode(authorLink);
        if (sourceCode == "invalid")
        {
            throw new UriFormatException();
        }

        // Name
        sourceCode = authorDataFrom(sourceCode, "itemprop=\"name");
        sourceCode = authorDataAfter(sourceCode, ">");
        string authorName = authorDataUntil(sourceCode, "</");

        // Counters; each is -1 when its text is not a number.
        int postsTotalNum = authorDataReadStat(ref sourceCode, "stat-post");
        int likesReceived = authorDataReadStat(ref sourceCode, "stat-likes");
        int following = authorDataReadStat(ref sourceCode, "stat-following");
        int followers = authorDataReadStat(ref sourceCode, "stat-followers");

        // Followed stocks: "<count> <title>, <title>, " built from the title attribute of
        // every list item.
        string stockLists = "";
        int stockListStart = sourceCode.IndexOf("member-stockList", StringComparison.Ordinal);
        if (stockListStart != -1)
        {
            sourceCode = sourceCode.Substring(stockListStart);
            string stockCode = authorDataUntil(sourceCode, "</ol");
            int count = 0;
            while (stockCode.IndexOf("<li>", StringComparison.Ordinal) != -1)
            {
                stockCode = authorDataAfter(stockCode, "<li>");
                stockCode = authorDataAfter(stockCode, "title=");
                stockCode = authorDataAfter(stockCode, "\"");
                stockLists += authorDataUntil(stockCode, "\"") + "/space";
                count++;
                stockLists = authorDataTidyList(stockLists, false);
            }
            stockLists = count + " " + stockLists.Replace("/space", ", ");
        }

        // Member name lists; only present on the page when the counter is positive.
        string followingLists = "";
        if (following > 0)
        {
            followingLists = authorDataReadMemberList(ref sourceCode, "Following " + following + " members");
        }

        string followersLists = "";
        if (followers > 0)
        {
            followersLists = authorDataReadMemberList(ref sourceCode, "Followed by " + followers + " members");
        }

        // Num of posts in a calendar month
        int numofPostsinaCalendarMonth = numOfPostsInAMonth(authorPostLink);

        var existing = db.Authors.FirstOrDefault(f => f.Name == authorName);
        if (existing == null)
        {
            db.Authors.Add(new HCDB_Authors
            {
                Name = authorName,
                Num_of_Posts = postsTotalNum,
                Likes_Received = likesReceived,
                Followers = followers,
                Followers_List = followersLists,
                Following = following,
                Following_List = followingLists,
                Following_Stocks = stockLists,
                Num_of_Posts_in_calendar_month = numofPostsinaCalendarMonth
            });
            db.SaveChanges();
            return;
        }

        // Known author: overwrite only the values that differ.
        if (existing.Num_of_Posts != postsTotalNum)
        {
            existing.Num_of_Posts = postsTotalNum;
        }
        if (existing.Likes_Received != likesReceived)
        {
            existing.Likes_Received = likesReceived;
        }
        if (existing.Followers != followers)
        {
            existing.Followers = followers;
        }
        if (existing.Following != following)
        {
            existing.Following = following;
        }
        if (existing.Followers_List != followersLists)
        {
            existing.Followers_List = followersLists;
        }
        if (existing.Following_List != followingLists)
        {
            existing.Following_List = followingLists;
        }
        if (existing.Num_of_Posts_in_calendar_month != numofPostsinaCalendarMonth)
        {
            existing.Num_of_Posts_in_calendar_month = numofPostsinaCalendarMonth;
        }
        if (existing.Following_Stocks != stockLists)
        {
            existing.Following_Stocks = stockLists;
        }
        db.SaveChanges();
    }
    catch (Exception ex)
    {
        Console.WriteLine("Error" + ex.ToString());
        Console.WriteLine("Error: " + ex.Message); // May not display in the form
    }
}

// Returns text starting at the first occurrence of marker; Substring throws
// ArgumentOutOfRangeException when the marker is absent.
private static string authorDataFrom(string text, string marker)
{
    return text.Substring(text.IndexOf(marker, StringComparison.Ordinal));
}

// Returns the text that follows the first occurrence of marker. When the marker is absent
// IndexOf's -1 is still added to the marker length, as in the code this replaces.
private static string authorDataAfter(string text, string marker)
{
    return text.Substring(text.IndexOf(marker, StringComparison.Ordinal) + marker.Length);
}

// Returns the text that precedes the first occurrence of marker; Substring throws
// ArgumentOutOfRangeException when the marker is absent.
private static string authorDataUntil(string text, string marker)
{
    return text.Substring(0, text.IndexOf(marker, StringComparison.Ordinal));
}

// Moves cursor to the stat block named by marker and parses the number that follows the
// block's first "</div>"; returns -1 when that text is not an integer.
private static int authorDataReadStat(ref string cursor, string marker)
{
    cursor = authorDataFrom(cursor, marker);
    cursor = authorDataAfter(cursor, "</div>");
    string digits = authorDataUntil(cursor, "<").Trim().Replace(",", "");
    int value;
    if (!int.TryParse(digits, out value))
    {
        value = -1;
    }
    return value;
}

// Replaces the first tag that begins with openToken (up to and including its ">") with a
// newline; returns html unchanged when openToken does not occur.
private static string authorDataStripFirstTag(string html, string openToken)
{
    int tagStart = html.IndexOf(openToken, StringComparison.Ordinal);
    if (tagStart == -1)
    {
        return html;
    }
    string rest = html.Substring(tagStart + openToken.Length);
    int afterTag = rest.IndexOf(">", StringComparison.Ordinal) + 1;
    return html.Substring(0, tagStart) + "\n" + rest.Substring(afterTag);
}

// Cleans the list accumulated so far: strips one <a> and one <span> opening tag, removes
// list-item and anchor markup and newlines, optionally removes spaces, and decodes the
// apostrophe entity.
private static string authorDataTidyList(string list, bool removeSpaces)
{
    list = authorDataStripFirstTag(list, "<a");
    list = authorDataStripFirstTag(list, "<span");
    list = list.Replace("<li>", "");
    list = list.Replace("\n", "");
    if (removeSpaces)
    {
        // NOTE(review): the replaced literal reads as a plain space here, but it may have
        // been "&nbsp;" before the file's entities were mangled; all three forms are removed.
        list = list.Replace(" ", "").Replace("\u00A0", "").Replace("&nbsp;", "");
    }
    list = list.Replace("</a>", "").Trim();
    // The entity spelling restores a literal that had been mangled into Replace("'", "'").
    return list.Replace("&#039;", "'").Replace("&#39;", "'");
}

// Moves cursor to heading, then into the following <ol>, and returns the text of every
// <li> (up to its first "</") as "name, name, ".
private static string authorDataReadMemberList(ref string cursor, string heading)
{
    cursor = authorDataFrom(cursor, heading);
    cursor = authorDataAfter(cursor, "<ol>");
    string items = authorDataUntil(cursor, "</ol");

    string names = "";
    while (items.IndexOf("<li>", StringComparison.Ordinal) != -1)
    {
        items = authorDataAfter(items, "<li>");
        names += authorDataUntil(items, "</") + "/space";
        names = authorDataTidyList(names, true);
    }
    return names.Replace("/space", ", ");
}
// MARKET DATA
/// <summary>
/// Click handler: downloads the page whose URL is in textBox1, reads the open, high, low
/// and two further "primary" values from its stock-pricing section, and stores them as one
/// new MarketData row.
/// </summary>
/// <remarks>
/// Does nothing (apart from the initial popup) when the URL box is blank. Failures are
/// reported in the list box and a message box, and the URL box is cleared.
/// Volume, value and market cap are not parsed; they are stored as 0.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void MarketDataButton(object sender, EventArgs e)
{
    // NOTE(review): HCDB looks like an Entity Framework context; if it is IDisposable it
    // should be wrapped in a using block - confirm against its declaration.
    var db = new HCDB();
    int newData = 0;

    // Retrieve the source code of a web page
    string url = textBox1.Text; // e.g. http://hotcopper.com.au/asx/anz#.VI98gSuUfJI
    // NOTE(review): looks like a leftover debugging popup - confirm before removing.
    MessageBox.Show("This runs");
    if (string.IsNullOrWhiteSpace(url))
    {
        return;
    }

    try
    {
        string sourceCode = WorkerClasses.getSourceCode(url);
        if (sourceCode == "invalid")
        {
            throw new UriFormatException();
        }
        listbox.Items.Add("[" + DateTime.Now + "] Process Starts. Please wait for a few minutes.");

        /* TAG */
        string groupWord = textBox2.Text;
        if (groupWord == "")
        {
            groupWord = WorkerClasses.getGroupWord(url);
        }

        // Everything of interest follows the stock-pricing marker; the cells are read in
        // page order, each search continuing after the previous cell.
        sourceCode = sourceCode.Substring(sourceCode.IndexOf("stock-pricing", StringComparison.Ordinal));

        // Open
        string openText = MarketDataReadCell(ref sourceCode, "class=\"primary\"");
        // NOTE(review): looks like a leftover debugging popup - confirm before removing.
        MessageBox.Show("Open is currently " + openText);
        Decimal openValue = MarketDataParsePrice(openText, true);

        // High
        Decimal highValue = MarketDataParsePrice(MarketDataReadCell(ref sourceCode, "class=\"high\""), false);

        // Low
        Decimal lowValue = MarketDataParsePrice(MarketDataReadCell(ref sourceCode, "class=\"low\""), false);

        // Value (stored as Last)
        Decimal lastValue = MarketDataParsePrice(MarketDataReadCell(ref sourceCode, "class=\"primary\""), true);

        // NOTE(review): this cell was labelled "Volume" but is stored as Market_Price -
        // confirm which page value it really is.
        Decimal marketPrice = MarketDataParsePrice(MarketDataReadCell(ref sourceCode, "class=\"primary\""), true);

        db.MarketData.Add(new HCDB_MarketData
        {
            Tag = groupWord,
            Date = DateTime.Now,
            High = highValue,
            Low = lowValue,
            Open = openValue,
            Last = lastValue,
            Market_Price = marketPrice,
            Volume__Millions_ = 0M,
            Value__Millions_ = 0M,
            Market_Cap__Billions_ = 0M
        });
        db.SaveChanges();
        newData++;
        listbox.Items.Add("\n[" + DateTime.Now + "] Market Data completed.\n" + newData + " saved.");
    }
    catch (UriFormatException)
    {
        listbox.Items.Add("Invalid URL!");
        MessageBox.Show("Invalid URL!");
        textBox1.Text = "";
    }
    catch (Exception ex)
    {
        listbox.Items.Add("Error found: " + ex);
        MessageBox.Show("Error found: " + ex);
        textBox1.Text = "";
    }
}

// Moves cursor to the element carrying classAttribute and returns its text up to the next
// "</" with thousands separators removed. Substring throws ArgumentOutOfRangeException
// when the attribute or the closing tag is missing.
private static string MarketDataReadCell(ref string cursor, string classAttribute)
{
    cursor = cursor.Substring(cursor.IndexOf(classAttribute, StringComparison.Ordinal));
    cursor = cursor.Substring(cursor.IndexOf(">", StringComparison.Ordinal) + 1);
    return cursor.Substring(0, cursor.IndexOf("</", StringComparison.Ordinal)).Replace(",", "");
}

// Converts a price cell to dollars. "$", the cent sign and (optionally) an "M" suffix are
// removed; the value is divided by 100 only when it was quoted in cents. The previous
// check, Contains(""), was always true and therefore divided every price by 100.
// NOTE(review): both the cent character and "&cent;" are accepted because the original
// literal was lost - confirm which form the page uses.
private static Decimal MarketDataParsePrice(string cellText, bool stripMillionSuffix)
{
    string digits = cellText.Replace("$", "");
    bool quotedInCents = digits.Contains("\u00A2") || digits.Contains("&cent;");
    digits = digits.Replace("\u00A2", "").Replace("&cent;", "");
    if (stripMillionSuffix)
    {
        digits = digits.Replace("M", "");
    }

    // Stay in decimal: the old double -> string -> decimal round trip could lose
    // precision or produce exponent notation that Convert.ToDecimal rejects.
    Decimal value = Convert.ToDecimal(digits);
    return quotedInCents ? value / 100 : value;
}
// Positive and Negative words
/// <summary>
/// Click handler: downloads the Loughran-McDonald positive and negative word lists and, for
/// every stored post that has no counts yet, stores how many of its words appear in each
/// list.
/// </summary>
/// <remarks>
/// A post is split on '.', '?', '!', ' ', ';', ':' and ','; words are matched
/// case-insensitively. Posts that already have PosWords or NegWords set are reported as
/// duplicates and left untouched. Each post is saved as soon as it has been counted.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void SentimentAnalysisButton(object sender, EventArgs e)
{
    // NOTE(review): HCDB looks like an Entity Framework context; if it is IDisposable it
    // should be wrapped in a using block - confirm against its declaration.
    var db = new HCDB();
    int duplicates = 0;
    int updatedData = 0;

    // Word lists, e.g. http://www3.nd.edu/~mcdonald/Word_Lists.html
    try
    {
        listbox.Items.Add("Process Starts. Please wait for a few minutes.");

        // Parsed once up front instead of once per post.
        var positiveWords = SentimentLoadWordList("http://www3.nd.edu/~mcdonald/Data/Finance_Word_Lists/LoughranMcDonald_Positive.csv");
        var negativeWords = SentimentLoadWordList("http://www3.nd.edu/~mcdonald/Data/Finance_Word_Lists/LoughranMcDonald_Negative.csv");
        char[] separators = { '.', '?', '!', ' ', ';', ':', ',' };

        foreach (var post in db.Posts.ToList())
        {
            if (post.PosWords != null || post.NegWords != null)
            {
                duplicates++;
                continue;
            }

            // A post without content counts as zero matches instead of aborting the run.
            string content = post.Content ?? "";
            int posCount = 0;
            int negCount = 0;
            foreach (string token in content.Split(separators, StringSplitOptions.RemoveEmptyEntries))
            {
                // The sets compare case-insensitively, so capitalised words are counted;
                // the old case-sensitive Contains pre-check skipped them.
                if (positiveWords.Contains(token))
                {
                    posCount++;
                }
                if (negativeWords.Contains(token))
                {
                    negCount++;
                }
            }

            post.PosWords = posCount;
            post.NegWords = negCount;
            db.SaveChanges();
            updatedData++;
        }

        listbox.Items.Add("\n[" + DateTime.Now + "] HotCopper Pos/Neg Word Process Ended.\n" + updatedData + " updated and " + duplicates + " duplicates Found.");
    }
    catch (UriFormatException)
    {
        // Caught by type: the message of a default-constructed exception does not contain
        // the type name, so the old message test never reached this branch.
        listbox.Items.Add("Invalid URL!");
        MessageBox.Show("Invalid URL!");
    }
    catch (Exception ex)
    {
        listbox.Items.Add(ex.Message);
    }
}

// Downloads a word list (one word per line) and returns its entries as a case-insensitive
// set. Throws UriFormatException when getSourceCode reports "invalid". The final entry is
// kept even when the file does not end with a line break.
private static System.Collections.Generic.HashSet<string> SentimentLoadWordList(string listUrl)
{
    string listText = WorkerClasses.getSourceCode(listUrl).ToLower().Replace("\n", "");
    if (listText == "invalid")
    {
        throw new UriFormatException();
    }

    var words = new System.Collections.Generic.HashSet<string>(StringComparer.OrdinalIgnoreCase);
    foreach (string entry in listText.Split(new[] { '\r' }, StringSplitOptions.RemoveEmptyEntries))
    {
        words.Add(entry);
    }
    return words;
}
/// <summary>
/// Parses the HTML of a HotCopper listing page and stores the threads and posts it lists.
/// </summary>
/// <remarks>
/// The page is consumed front to back: after each field is located, <c>sourceCode</c> is cut
/// so that it starts just past that field. The order of the extraction steps therefore
/// matters. For every "listblock tags" row the method reads tags, subject, thread link,
/// author and the author's posts link. Rows whose subject does not start with "re:" or
/// "ann:" are treated as the first post of a thread: views and time are read, the thread
/// page is downloaded, and a row in <c>db.Threads</c> is inserted or updated. First posts
/// and "re:" rows are then passed to <c>WorkerClasses.savePageData</c>.
/// Any parsing failure (a marker that is not found makes <c>Substring</c> throw) stops the
/// loop and is reported through the message boxes and the list box.
/// </remarks>
/// <param name="pageSource">
/// HTML of the listing page, the literal "invalid" when the download failed, or null when
/// no page was supplied.
/// </param>
private void GetHotCopperThreads(string pageSource)
{
    var db = new HCDB();
    int duplicates = 0;
    int newData = 0;
    int postNewData = 0;

    if (pageSource == null)
    {
        listbox.Items.Add("Please enter URL.");
        MessageBox.Show("Please enter URL.");
        return;
    }

    try
    {
        // Stock code: taken from the text box, otherwise from whatever follows the host name in the URL.
        string stock = textBox2.Text;
        if (stock == null || stock.Trim() == "")
        {
            int identifier = url.IndexOf("hotcopper.com.au") + 16;
            if (url != "")
            {
                stock = url.Substring(identifier);
                if (stock.Contains("#"))
                {
                    stock = stock.Substring(0, stock.IndexOf("#"));
                }
            }
        }

        string sourceCode = pageSource;
        if (sourceCode == "invalid")
        {
            throw new UriFormatException();
        }

        listbox.Items.Add("Process Starts. Please wait for a few minutes.");

        /* Group */
        // NOTE(review): groupWord is not read again in this method; the call is kept in case
        // getGroupWord has side effects or is relied on to reject a bad URL. Confirm and remove.
        string groupWord = textBox2.Text;
        if (groupWord == "")
        {
            groupWord = WorkerClasses.getGroupWord(url);
        }

        #region HotCopper THREADS results only
        try
        {
            /* First of all, walk every listed row */
            while (sourceCode.IndexOf("listblock tags") != -1)
            {
                // Tags: text of every anchor inside the tags cell, space separated.
                int startIndex = sourceCode.IndexOf("listblock tags");
                sourceCode = sourceCode.Substring(startIndex);
                int endIndex = sourceCode.IndexOf("</div");
                string relCode = sourceCode.Substring(0, endIndex);
                string tags = "";
                while (relCode.Contains("<a href"))
                {
                    startIndex = relCode.IndexOf("<a href");
                    relCode = relCode.Substring(startIndex);
                    startIndex = relCode.IndexOf(">") + 1;
                    relCode = relCode.Substring(startIndex);
                    endIndex = relCode.IndexOf("</");
                    tags += " " + relCode.Substring(0, endIndex);
                }
                // NOTE(review): as written this Replace is a no-op; the first literal was
                // presumably an HTML entity. Confirm against the original source.
                tags = tags.Replace("&", "&").Trim();

                // Subject: content of the first heading tag in the subject cell.
                startIndex = sourceCode.IndexOf("listblock subject");
                sourceCode = sourceCode.Substring(startIndex);
                startIndex = sourceCode.IndexOf("<h");
                sourceCode = sourceCode.Substring(startIndex);
                startIndex = sourceCode.IndexOf(">") + 1;
                sourceCode = sourceCode.Substring(startIndex);
                endIndex = sourceCode.IndexOf("</");
                string subject = sourceCode.Substring(0, endIndex);

                string threadLink = "";
                if (subject.Contains("<a"))
                {
                    // Thread link: first quoted value after "<a href".
                    string tempCode = sourceCode;
                    startIndex = tempCode.IndexOf("<a href");
                    tempCode = tempCode.Substring(startIndex);
                    startIndex = tempCode.IndexOf("\"") + 1;
                    tempCode = tempCode.Substring(startIndex);
                    endIndex = tempCode.IndexOf("\"");
                    threadLink = "http://hotcopper.com.au/" + tempCode.Substring(0, endIndex);

                    // Remove the opening "<a ...>" tag from the subject text.
                    endIndex = subject.IndexOf("<a");
                    string firstPart = subject.Substring(0, endIndex);
                    subject = subject.Substring(endIndex + 2);
                    startIndex = subject.IndexOf(">") + 1;
                    string secondPart = subject.Substring(startIndex);
                    subject = firstPart + "\n" + secondPart;
                }
                subject = subject.Replace("</a>", "").Trim();
                // NOTE(review): the next Replace is a no-op as written and the one after it turns
                // every '&' into an apostrophe; both look like HTML-entity decoding whose literals
                // were altered. Confirm against the original source.
                subject = subject.Replace("'", "'");
                subject = subject.Replace("&", "'");

                // Author: raw content of the author cell (anchors are stripped further down).
                startIndex = sourceCode.IndexOf("listblock author ");
                sourceCode = sourceCode.Substring(startIndex);
                startIndex = sourceCode.IndexOf(">") + 1;
                sourceCode = sourceCode.Substring(startIndex);
                endIndex = sourceCode.IndexOf("</");
                string author = sourceCode.Substring(0, endIndex);

                // Author's posts link: first quoted value after "a href".
                startIndex = sourceCode.IndexOf("a href");
                sourceCode = sourceCode.Substring(startIndex);
                startIndex = sourceCode.IndexOf("\"") + 1;
                sourceCode = sourceCode.Substring(startIndex);
                endIndex = sourceCode.IndexOf("\"");
                string authorPostLink = "https://hotcopper.com.au/" + sourceCode.Substring(0, endIndex);
                authorPostLink = authorPostLink.Replace("&", "&"); // NOTE(review): no-op as written

                while (author.Contains("<a"))
                {
                    endIndex = author.IndexOf("<a");
                    string firstPart = author.Substring(0, endIndex);
                    author = author.Substring(endIndex + 2);
                    startIndex = author.IndexOf(">") + 1;
                    string secondPart = author.Substring(startIndex);
                    author = firstPart + "\n" + secondPart;
                }
                author = author.Replace("</a>", "").Trim();
                author = author.Replace("'", "'"); // NOTE(review): no-op as written

                // Classify the row by the first (up to) four characters of the subject.
                // Clamping the length keeps subjects shorter than four characters from throwing
                // and aborting the whole page.
                // NOTE(review): WorkerClasses.savePageData applies the same 4-character Substring
                // to the subject and needs the same guard.
                string subjectPrefix = subject.Substring(0, Math.Min(4, subject.Length)).ToLowerInvariant();
                bool isReply = subjectPrefix.Contains("re:");
                bool isAnnouncement = subjectPrefix.Contains("ann:");

                // Neither a reply nor an announcement: first post of a thread.
                if (!isReply && !isAnnouncement)
                {
                    // Views: text after the first "</span>" in the stats cell; when there is no
                    // span, fall back to the text after the first '>'. The "not found" test is
                    // made on the raw IndexOf result: after adding 7 it could never equal -1,
                    // which left the fallback unreachable.
                    startIndex = sourceCode.IndexOf("listblock stats ");
                    sourceCode = sourceCode.Substring(startIndex);
                    int spanEnd = sourceCode.IndexOf("</span>");
                    startIndex = spanEnd != -1 ? spanEnd + 7 : sourceCode.IndexOf(">") + 1;
                    sourceCode = sourceCode.Substring(startIndex);
                    endIndex = sourceCode.IndexOf("</");
                    string viewStr = sourceCode.Substring(0, endIndex).Trim();
                    viewStr = viewStr.Replace(",", "");
                    int view = Convert.ToInt32(viewStr);

                    // Time shown in the listing row.
                    startIndex = sourceCode.IndexOf("listblock time");
                    sourceCode = sourceCode.Substring(startIndex);
                    startIndex = sourceCode.IndexOf(">") + 1;
                    sourceCode = sourceCode.Substring(startIndex);
                    // Skip one more opening tag when another '>' precedes the next closing tag.
                    int nestedTagEnd = sourceCode.IndexOf(">");
                    endIndex = sourceCode.IndexOf("</");
                    if (nestedTagEnd != -1 && nestedTagEnd + 1 < endIndex)
                    {
                        sourceCode = sourceCode.Substring(nestedTagEnd + 1);
                    }
                    endIndex = sourceCode.IndexOf("</");
                    string datetimeStr = sourceCode.Substring(0, endIndex).Trim();
                    DateTime threadDate = Convert.ToDateTime(datetimeStr);

                    // Look inside the thread itself.
                    if (threadLink != "")
                    {
                        string threadCode = WorkerClasses.getSourceCode(threadLink);
                        if (threadCode == "invalid")
                        {
                            throw new UriFormatException();
                        }

                        // Date shown next to the clock icon on the thread page.
                        startIndex = threadCode.IndexOf("icon left icon-clock");
                        threadCode = threadCode.Substring(startIndex);
                        startIndex = threadCode.IndexOf("</span>") + 7;
                        threadCode = threadCode.Substring(startIndex);
                        endIndex = threadCode.IndexOf("</");
                        string threadBeginStr = threadCode.Substring(0, endIndex).Trim();

                        // NOTE(review): threadBeginStr is parsed but never used; the begin date has
                        // always been taken from the listing row's time (datetimeStr). That is kept
                        // unchanged because Begin_Date is part of the lookup key for existing rows.
                        // Confirm which source is intended.
                        DateTime threadBegin = threadDate;

                        // Defaults for a thread without first-page navigation: this post is the only one.
                        int totalPosts = 1;
                        string lastPoster = author;
                        string lastPost = threadLink;
                        // Snapshot handed to savePageData; taken after the cut above, so it starts
                        // just past the clock icon's "</span>".
                        string threadCodeOrg = threadCode;

                        // "Nav to First" present: read the PageNav block, then follow the link
                        // after "Next" to find the last post and its author.
                        if (threadCode.IndexOf("rel=\"start\"") != -1)
                        {
                            // Total: quoted value following "data-last" inside PageNav.
                            startIndex = threadCode.IndexOf("PageNav");
                            threadCode = threadCode.Substring(startIndex);
                            startIndex = threadCode.IndexOf("data-last") + 9;
                            threadCode = threadCode.Substring(startIndex);
                            startIndex = threadCode.IndexOf("\"") + 1;
                            threadCode = threadCode.Substring(startIndex);
                            endIndex = threadCode.IndexOf("\"");
                            totalPosts = Convert.ToInt32(threadCode.Substring(0, endIndex));

                            // Last post: first link after the "Next" anchor.
                            startIndex = threadCode.IndexOf("Next</a>");
                            threadCode = threadCode.Substring(startIndex);
                            startIndex = threadCode.IndexOf("a href");
                            threadCode = threadCode.Substring(startIndex);
                            startIndex = threadCode.IndexOf("\"") + 1;
                            threadCode = threadCode.Substring(startIndex);
                            endIndex = threadCode.IndexOf("\"");
                            string lastPostLink = "http://hotcopper.com.au/" + threadCode.Substring(0, endIndex);

                            string lastPostCode = WorkerClasses.getSourceCode(lastPostLink);
                            if (lastPostCode == "invalid")
                            {
                                throw new UriFormatException();
                            }
                            lastPost = lastPostLink;

                            // Last poster: content of the first heading inside "user-wrap".
                            startIndex = lastPostCode.IndexOf("user-wrap");
                            lastPostCode = lastPostCode.Substring(startIndex);
                            startIndex = lastPostCode.IndexOf("<h");
                            lastPostCode = lastPostCode.Substring(startIndex);
                            startIndex = lastPostCode.IndexOf(">") + 1;
                            lastPostCode = lastPostCode.Substring(startIndex);
                            endIndex = lastPostCode.IndexOf("</");
                            lastPoster = lastPostCode.Substring(0, endIndex);

                            while (lastPoster.Contains("<a"))
                            {
                                endIndex = lastPoster.IndexOf("<a");
                                string firstPart = lastPoster.Substring(0, endIndex);
                                lastPoster = lastPoster.Substring(endIndex + 2);
                                startIndex = lastPoster.IndexOf(">") + 1;
                                string secondPart = lastPoster.Substring(startIndex);
                                lastPoster = firstPart + "\n" + secondPart;
                            }
                            lastPoster = lastPoster.Replace("</a>", "").Trim();
                            lastPoster = lastPoster.Replace("'", "'"); // NOTE(review): no-op as written
                        }

                        // One lookup decides between insert and update (previously an existence
                        // check followed by a second, identical query).
                        var existing = db.Threads.FirstOrDefault(t => t.Subject == subject && t.Begin_Date == threadBegin);
                        if (existing == null)
                        {
                            db.Threads.Add(new HCDB_Threads
                            {
                                Stock = stock,
                                Tags = tags,
                                Subject = subject,
                                Num_of_Posts = totalPosts,
                                Num_of_Views = view,
                                First_Poster = author,
                                Begin_Date = threadBegin,
                                Last_Post = lastPost,
                                Last_Poster = lastPoster
                            });
                            db.SaveChanges();
                            Console.WriteLine("Count " + newData);
                            newData++;
                        }
                        else
                        {
                            if (existing.Last_Post != lastPost) { existing.Last_Post = lastPost; }
                            if (existing.Last_Poster != lastPoster) { existing.Last_Poster = lastPoster; }
                            if (existing.Num_of_Posts != totalPosts) { existing.Num_of_Posts = totalPosts; }
                            if (existing.Num_of_Views != view) { existing.Num_of_Views = view; }
                            db.SaveChanges();
                            duplicates++;
                        }

                        // Save Page Data
                        postNewData += WorkerClasses.savePageData(stock, subject, threadCodeOrg, threadLink, authorPostLink);
                    }
                }
                // Second, third, fourth and later posts.
                else if (isReply)
                {
                    string threadCodeOrg = WorkerClasses.getSourceCode(threadLink);
                    if (threadCodeOrg == "invalid")
                    {
                        throw new UriFormatException();
                    }

                    // Save Page Data
                    postNewData += WorkerClasses.savePageData(stock, subject, threadCodeOrg, threadLink, authorPostLink);
                }
            }
        }
        catch (Exception ex)
        {
            // Any failure while walking the rows ends the walk; totals so far are still reported below.
            MessageBox.Show(ex.Message);
            MessageBox.Show(ex.ToString());
            listbox.Items.Add("Error: " + ex);
        }

        listbox.Items.Add("\n[" + DateTime.Now + "] HotCopper Threads completed.\n" + newData + " saved and " + duplicates + " duplicates Updated.");
        listbox.Items.Add("\n[" + DateTime.Now + "] HotCopper Posts completed.\n" + postNewData + " saved.");
        #endregion
    }
    catch (Exception ex)
    {
        // Reached only for failures before the row walk starts (stock/URL handling, "invalid" page).
        MessageBox.Show(ex.Message);
        MessageBox.Show(ex.ToString());
        listbox.Items.Add("Invalid URL!");
        MessageBox.Show("Invalid URL!");
        textBox1.Text = "";
    }
}