/// <summary>
/// Expands every accepted input row into one output row per extracted entity.
/// </summary>
/// <remarks>
/// A row is skipped (nothing is yielded for it) when:
/// <list type="bullet">
/// <item><description>the tweet or user creation timestamp cannot be parsed,</description></item>
/// <item><description>the user account is younger than 30 days at tweet time,</description></item>
/// <item><description>the target URL points at itunes.apple.com or store.apple.com,</description></item>
/// <item><description>no entity survives filtering (the final loop has nothing to emit).</description></item>
/// </list>
/// Entities come from three sources, in this order: the three named-entity columns
/// ("Original"), n-grams of the tweet text ("Text") and, for news URLs only, n-grams of
/// the title ("Title"). Entities are de-duplicated per row on their lower-cased form.
/// </remarks>
/// <param name="input">Row set whose <c>Rows</c> are processed one by one.</param>
/// <param name="outputRow">Reusable output row; the same instance is yielded for every entity.</param>
/// <param name="args">Not used by this method.</param>
/// <returns>A lazily evaluated sequence of populated output rows.</returns>
public override IEnumerable<Row> Process(RowSet input, Row outputRow, string[] args)
{
    foreach (Row row in input.Rows)
    {
        TextInfo textInfo = CultureInfo.CurrentCulture.TextInfo;

        // Parse the tweet's creation time. Any failure (including formatting) skips the row.
        // NOTE(review): DateTime.Parse uses the current culture; confirm this matches the
        // format produced by TextFunctions.DateFormatUtc.
        DateTime tweetCreated;
        string currentdate;
        string currenthour;
        string currentminute;
        try
        {
            tweetCreated = DateTime.Parse(
                TextFunctions.DateFormatUtc(TextFunctions.Strip(row["tCreatedAt"].ToString()), "0", 0.0));
            currentdate = tweetCreated.Date.ToString("yyyy-MM-dd");
            currenthour = tweetCreated.ToString("HH");
            currentminute = tweetCreated.ToString("mm");
        }
        catch (Exception)
        {
            continue;
        }

        // Parse the user's creation time and derive the account age at tweet time.
        int userAgeInDays;
        try
        {
            DateTime userCreated = DateTime.Parse(
                TextFunctions.DateFormatUtc(TextFunctions.Strip(row["uCreatedAt"].ToString()), "0", 0.0));
            userAgeInDays = Convert.ToInt32((tweetCreated - userCreated).TotalDays);
        }
        catch (Exception)
        {
            continue;
        }

        // Accounts younger than 30 days are ignored.
        if (userAgeInDays < 30)
        {
            continue;
        }

        // printableUrl keeps the URL as received; targeturl is the scheme-less, hash-less form.
        string printableUrl = TextFunctions.StripChar(row["tUrl"].String).Trim();
        string targeturl = printableUrl;
        if (targeturl != "")
        {
            targeturl = TextFunctions.removeURLAfterHash(StripLeadingUrlScheme(targeturl));
        }

        if (targeturl.Contains("itunes.apple.com") || targeturl.Contains("store.apple.com"))
        {
            continue;
        }

        // A URL counts as news when the link annotations flag it, except for two excluded domains.
        int isNews = 0;
        if (row["tLinkAnnotations"].String.Contains("newsclassifier.is_news_domain\",\"Value\":\"1\"")
            && !targeturl.Contains("mtv.com")
            && !targeturl.Contains("popsugar.com"))
        {
            isNews = 1;
        }

        // Lower-cased entities already accepted for this row.
        HashSet<string> alreadyPrinted = new HashSet<string>();
        string title = TextFunctions.cleanString(TextFunctions.StripChar(row["tTitle"].String)).Trim();

        string[] entities = new string[]
        {
            row["tNamedEntityFirst"].String.Trim(),
            row["tNamedEntitySecond"].String.Trim(),
            row["tNamedEntityThird"].String.Trim()
        };
        string[] entityTypes = new string[]
        {
            row["tNamedEntityCategoryFirst"].String.Trim(),
            row["tNamedEntityCategorySecond"].String.Trim(),
            row["tNamedEntityCategoryThird"].String.Trim()
        };

        // Process explicit entities.
        List<Entity> entityList = new List<Entity>();
        for (int i = 0; i < entities.Length; i++)
        {
            string entity = entities[i];
            string entityType = entityTypes[i];

            // Also covers the empty string.
            if (entity.Length < 3)
            {
                continue;
            }

            if (entityType == "DATE" || entityType == "TIME-POINT" || entityType == "ZIP")
            {
                continue;
            }

            string entityOriginal = entity.Replace(" _ ", " ").Trim();
            string entityLower = entityOriginal.ToLower();

            // Only all-lower-case entities are title-cased for display; others keep their casing.
            string entityPrintable = entityLower == entityOriginal
                ? textInfo.ToTitleCase(entityLower)
                : entityOriginal;

            // The entity is marked as seen before the domain check, so a domain-derived
            // entity also suppresses the same n-gram later on.
            if (!alreadyPrinted.Add(entityLower))
            {
                continue;
            }

            if (IsEntityExtractedFromDomain(targeturl, entityOriginal))
            {
                continue;
            }

            entityList.Add(new Entity()
            {
                entity = entityLower,
                entityPrintable = entityPrintable,
                entitytype = "entity",
                entitysubtypetype = entityType,
                entitySource = "Original"
            });
        }

        // Process n-grams of the tweet text, then (news only) of the headline.
        // previousEntity is carried from the first pass into the second.
        string previousEntity = "";
        previousEntity = CollectNgramEntities(
            TextFunctions.generateNgrams(row["tText"].String, 1, 4),
            "Text", targeturl, alreadyPrinted, entityList, previousEntity);

        string titleNgrams = "";
        if (isNews == 1)
        {
            titleNgrams = TextFunctions.generateNgrams(title, 1, 4);
        }
        CollectNgramEntities(titleNgrams, "Title", targeturl, alreadyPrinted, entityList, previousEntity);

        // Columns shared by every entity row of this tweet.
        outputRow["dateCreation"].Set(currentdate);
        outputRow["hourCreation"].Set(currenthour);
        outputRow["minuteCreation"].Set(currentminute);
        outputRow["targeturl"].Set(targeturl);
        outputRow["domain"].Set(row["tDomain"].String.Trim());
        outputRow["thumbnail"].Set(TextFunctions.ExtractThumbnail(row["tLinkAnnotations"].String));
        outputRow["url"].Set(printableUrl);
        outputRow["isnewsurl"].Set(isNews);
        outputRow["tID"].Set(row["tID"].String);
        outputRow["uID"].Set(row["uID"].String);
        outputRow["uScreenName"].Set(row["uScreenName"].String);
        outputRow["uName"].Set(row["uName"].String);
        outputRow["userAgeInDays"].Set(userAgeInDays);
        outputRow["sentiment"].Set(0.0);

        // The title is only exposed for news URLs and youtube.com links.
        if (isNews == 1 || row["tDomain"].String == "youtube.com")
        {
            outputRow["tweettext"].Set(TextFunctions.cleanString(title));
        }
        else
        {
            outputRow["tweettext"].Set("");
        }

        outputRow["tIsRetweet"].Set(row["tIsRetweet"].String);

        // Additional tweet and user columns.
        string text = TextFunctions.normalizedText(TextFunctions.cleanText(row["tText"].String));
        outputRow["tText"].Set(text);
        row["tText"].CopyTo(outputRow["tRawText"]);
        outputRow["tTextFragments"].Set(row["tTextFragments"].String);
        outputRow["tPublishTime"].Set(
            TextFunctions.DateFormatFileUtc(TextFunctions.Strip(row["tCreatedAt"].ToString()), "0", 0.0));
        outputRow["tRetweetCount"].Set(row["tRetweetCount"].String);
        outputRow["tSpamScore"].Set(row["tSpamScore"].String);
        outputRow["uAuthScore"].Set(row["uAuthScore"].String);
        outputRow["uFavoritesCount"].Set(row["uFavoritesCount"].String);
        outputRow["uFollowersCount"].Set(row["uFollowersCount"].String);
        outputRow["uFriendsCount"].Set(row["uFriendsCount"].String);
        outputRow["uProfile"].Set(TextFunctions.generateProfile(
            row["uID"].String,
            row["uName"].String,
            row["uScreenName"].String,
            row["uProfilePage"].String,
            row["uProfileImageUrl"].String,
            row["uVerified"].Boolean));
        outputRow["uVerified"].Set(row["uVerified"].String);

        // Emit one row per entity; only the entity columns change between yields.
        foreach (Entity entity in entityList)
        {
            outputRow["entity"].Set(entity.entity);
            outputRow["entityPrintable"].Set(entity.entityPrintable);
            outputRow["entitytype"].Set(entity.entitytype);
            outputRow["entitysubtypetype"].Set(entity.entitysubtypetype);
            outputRow["entitySource"].Set(entity.entitySource);
            yield return outputRow;
        }
    }
}

// Leading prefixes removed from target URLs; the "www." variants must be tried first.
private static readonly string[] LeadingUrlSchemePrefixes =
{
    "http://www.",
    "https://www.",
    "http://",
    "https://"
};

/// <summary>
/// Removes a single leading "http(s)://" (optionally followed by "www.") from a URL.
/// </summary>
/// <remarks>
/// Only the start of the string is inspected, so a scheme embedded later in the URL
/// (for example inside a redirect query parameter) is left untouched.
/// </remarks>
/// <param name="url">Non-null URL text.</param>
/// <returns>The URL without its leading scheme prefix, or the input unchanged if none matches.</returns>
private static string StripLeadingUrlScheme(string url)
{
    foreach (string prefix in LeadingUrlSchemePrefixes)
    {
        if (url.StartsWith(prefix, StringComparison.Ordinal))
        {
            return url.Substring(prefix.Length);
        }
    }

    return url;
}

/// <summary>
/// Turns a ';'-separated n-gram string into entities and appends the new ones to <paramref name="entityList"/>.
/// </summary>
/// <remarks>
/// Tokens shorter than three characters, tokens already present in <paramref name="alreadyPrinted"/>
/// and tokens for which <c>IsEntityExtractedFromDomain</c> returns true are dropped.
/// A token is typed "hashtag" when it starts with '#', "username" when it starts with '@' or equals
/// the previously accepted token without its leading '@', and "ngram" otherwise.
/// </remarks>
/// <param name="ngramText">N-grams joined with ';'; an empty string produces no entities.</param>
/// <param name="entitySource">Value stored in each created entity's <c>entitySource</c>.</param>
/// <param name="targeturl">Normalized target URL passed to the domain check.</param>
/// <param name="alreadyPrinted">Lower-cased entities seen so far; accepted tokens are added to it.</param>
/// <param name="entityList">Destination list for the created entities.</param>
/// <param name="previousEntity">Last token accepted by an earlier pass, or an empty string.</param>
/// <returns>The last token accepted, or <paramref name="previousEntity"/> if none was accepted.</returns>
private string CollectNgramEntities(
    string ngramText,
    string entitySource,
    string targeturl,
    HashSet<string> alreadyPrinted,
    List<Entity> entityList,
    string previousEntity)
{
    foreach (string token in ngramText.Trim().Split(';'))
    {
        if (token.Length < 3)
        {
            continue;
        }

        string ngramType;
        if (token[0] == '#')
        {
            ngramType = "hashtag";
        }
        else if (token[0] == '@' || ("@" + token) == previousEntity)
        {
            ngramType = "username";
        }
        else
        {
            ngramType = "ngram";
        }

        string tokenLower = token.ToLower();
        if (alreadyPrinted.Contains(tokenLower))
        {
            continue;
        }

        // Unlike explicit entities, a domain-derived token is NOT recorded as seen.
        if (IsEntityExtractedFromDomain(targeturl, token))
        {
            continue;
        }

        previousEntity = token;
        alreadyPrinted.Add(tokenLower);
        entityList.Add(new Entity()
        {
            entity = tokenLower,
            entityPrintable = token,
            entitytype = ngramType,
            entitysubtypetype = string.Empty,
            entitySource = entitySource
        });
    }

    return previousEntity;
}