Beispiel #1
0
    /// <summary>
    /// Expands every accepted input row into one output row per extracted entity.
    /// </summary>
    /// <remarks>
    /// A row is skipped when its "tCreatedAt" or "uCreatedAt" value cannot be parsed, when the
    /// difference between the two is below 30 days, or when the normalized link contains
    /// "itunes.apple.com" or "store.apple.com". Entities are collected from three sources, in this
    /// order: the three named-entity columns (source "Original"), the n-grams that
    /// <c>TextFunctions.generateNgrams(text, 1, 4)</c> returns for "tText" (source "Text") and, only
    /// when the link is flagged as news, the n-grams of the cleaned "tTitle" (source "Title").
    /// An entity key (lower-cased text) is emitted at most once per input row.
    /// </remarks>
    /// <param name="input">Row set whose rows are processed one by one.</param>
    /// <param name="outputRow">
    /// Single row instance that is re-populated and yielded once per entity; it is not copied.
    /// </param>
    /// <param name="args">Not used by this method.</param>
    /// <returns>A lazily evaluated sequence that yields <paramref name="outputRow"/> once per entity.</returns>
    public override IEnumerable<Row> Process(RowSet input, Row outputRow, string[] args)
    {
        foreach (Row row in input.Rows)
        {
            // Title-casing of all-lowercase entities follows the executing thread's culture.
            TextInfo textInfo = System.Threading.Thread.CurrentThread.CurrentCulture.TextInfo;

            DateTime tweetCreated;
            string   currentdate   = "";
            string   currenthour   = "";
            string   currentminute = "";
            int      userAgeInDays;

            // Parse the tweet's creation time; rows with an unusable timestamp are dropped on purpose.
            // NOTE(review): DateTime.Parse and the ToString calls use the thread's current culture -
            // confirm that DateFormatUtc returns a culture-neutral string.
            try
            {
                string tweetCreatedText = TextFunctions.DateFormatUtc(TextFunctions.Strip(row["tCreatedAt"].ToString()), "0", 0.0);
                tweetCreated  = DateTime.Parse(tweetCreatedText);
                currentdate   = tweetCreated.Date.ToString("yyyy-MM-dd");
                currenthour   = tweetCreated.ToString("HH");
                currentminute = tweetCreated.ToString("mm");
            }
            catch (Exception)
            {
                continue;
            }

            // Parse the user's creation time; the "age" is measured at the moment of the tweet.
            try
            {
                string userCreatedText = TextFunctions.DateFormatUtc(TextFunctions.Strip(row["uCreatedAt"].ToString()), "0", 0.0);
                userAgeInDays = Convert.ToInt32((tweetCreated - DateTime.Parse(userCreatedText)).TotalDays);
            }
            catch (Exception)
            {
                continue;
            }

            // Accounts younger than 30 days at tweet time are ignored.
            if (userAgeInDays < 30)
            {
                continue;
            }

            // printableUrl keeps the link as received; targeturl is the normalized form used for matching.
            string printableUrl = TextFunctions.StripChar(row["tUrl"].String).Trim();
            string targeturl    = printableUrl;
            if (targeturl != "")
            {
                targeturl = TextFunctions.removeURLAfterHash(StripUrlPrefix(targeturl));
            }

            if (targeturl.Contains("itunes.apple.com") || targeturl.Contains("store.apple.com"))
            {
                continue;
            }

            // A link counts as news when the annotation carries the classifier flag, except for two excluded domains.
            int isNews = 0;
            if (row["tLinkAnnotations"].String.Contains("newsclassifier.is_news_domain\",\"Value\":\"1\"") && !targeturl.Contains("mtv.com") && !targeturl.Contains("popsugar.com"))
            {
                isNews = 1;
            }

            // Lower-cased keys of everything collected so far for this row (shared by all three sources).
            HashSet<string> alreadyPrinted = new HashSet<string>();
            string          title          = TextFunctions.cleanString(TextFunctions.StripChar(row["tTitle"].String)).Trim();
            string[]        entities       = new string[] { row["tNamedEntityFirst"].String.Trim(), row["tNamedEntitySecond"].String.Trim(), row["tNamedEntityThird"].String.Trim() };
            string[]        entityTypes    = new string[] { row["tNamedEntityCategoryFirst"].String.Trim(), row["tNamedEntityCategorySecond"].String.Trim(), row["tNamedEntityCategoryThird"].String.Trim() };

            // Process explicit entities
            List<Entity> entityList = new List<Entity>();
            for (int i = 0; i < entities.Length; i++)
            {
                string entityType = entityTypes[i];

                // Entities shorter than three characters (including empty ones) are ignored.
                if (entities[i].Length < 3)
                {
                    continue;
                }
                if (entityType == "DATE" || entityType == "TIME-POINT" || entityType == "ZIP")
                {
                    continue;
                }

                string entityOriginal = entities[i].Replace(" _ ", " ").Trim();
                string entityKey      = entityOriginal.ToLower();

                // Only entities that arrive entirely in lower case are title-cased for display.
                string printable = entityOriginal;
                if (entityKey == entityOriginal)
                {
                    printable = textInfo.ToTitleCase(entityKey);
                }

                // Add returns false for a key that is already present, i.e. a duplicate entity.
                if (!alreadyPrinted.Add(entityKey))
                {
                    continue;
                }

                if (IsEntityExtractedFromDomain(targeturl, entityOriginal))
                {
                    continue;
                }

                entityList.Add(new Entity()
                {
                    entity = entityKey, entityPrintable = printable, entitytype = "entity", entitysubtypetype = entityType, entitySource = "Original"
                });
            }

            // Process ngrams of the tweet text
            string previousEntity = "";
            previousEntity = AppendNgramEntities(TextFunctions.generateNgrams(row["tText"].String, 1, 4), "Text", targeturl, alreadyPrinted, entityList, previousEntity);

            // Headline ngrams: only generated for news links (the title is the news document's title).
            string titleNgrams = "";
            if (isNews == 1)
            {
                titleNgrams = TextFunctions.generateNgrams(title, 1, 4);
            }
            previousEntity = AppendNgramEntities(titleNgrams, "Title", targeturl, alreadyPrinted, entityList, previousEntity);

            outputRow["dateCreation"].Set(currentdate);
            outputRow["hourCreation"].Set(currenthour);
            outputRow["minuteCreation"].Set(currentminute);
            outputRow["targeturl"].Set(targeturl);
            outputRow["domain"].Set(row["tDomain"].String.Trim());
            outputRow["thumbnail"].Set(TextFunctions.ExtractThumbnail(row["tLinkAnnotations"].String));
            outputRow["url"].Set(printableUrl);
            outputRow["isnewsurl"].Set(isNews);
            outputRow["tID"].Set(row["tID"].String);
            outputRow["uID"].Set(row["uID"].String);
            outputRow["uScreenName"].Set(row["uScreenName"].String);
            outputRow["uName"].Set(row["uName"].String);
            outputRow["userAgeInDays"].Set(userAgeInDays);
            outputRow["sentiment"].Set(0.0);

            // The cleaned title is exposed as "tweettext" for news links and youtube.com links only.
            if (isNews == 1 || row["tDomain"].String == "youtube.com")
            {
                outputRow["tweettext"].Set(TextFunctions.cleanString(title));
            }
            else
            {
                outputRow["tweettext"].Set("");
            }
            outputRow["tIsRetweet"].Set(row["tIsRetweet"].String);

            // Columns added later: normalized/raw text, publish time, counters and user profile.
            string text = TextFunctions.normalizedText(TextFunctions.cleanText(row["tText"].String));
            outputRow["tText"].Set(text);
            row["tText"].CopyTo(outputRow["tRawText"]);
            outputRow["tTextFragments"].Set(row["tTextFragments"].String);
            outputRow["tPublishTime"].Set(TextFunctions.DateFormatFileUtc(TextFunctions.Strip(row["tCreatedAt"].ToString()), "0", 0.0));
            outputRow["tRetweetCount"].Set(row["tRetweetCount"].String);
            outputRow["tSpamScore"].Set(row["tSpamScore"].String);
            outputRow["uAuthScore"].Set(row["uAuthScore"].String);
            outputRow["uFavoritesCount"].Set(row["uFavoritesCount"].String);
            outputRow["uFollowersCount"].Set(row["uFollowersCount"].String);
            outputRow["uFriendsCount"].Set(row["uFriendsCount"].String);
            outputRow["uProfile"].Set(TextFunctions.generateProfile(row["uID"].String, row["uName"].String, row["uScreenName"].String, row["uProfilePage"].String, row["uProfileImageUrl"].String, row["uVerified"].Boolean));
            outputRow["uVerified"].Set(row["uVerified"].String);

            // All row-level columns stay fixed; only the entity columns change between yields.
            foreach (Entity entity in entityList)
            {
                outputRow["entity"].Set(entity.entity);
                outputRow["entityPrintable"].Set(entity.entityPrintable);
                outputRow["entitytype"].Set(entity.entitytype);
                outputRow["entitysubtypetype"].Set(entity.entitysubtypetype);
                outputRow["entitySource"].Set(entity.entitySource);
                yield return outputRow;
            }
        }
    }

    /// <summary>
    /// Removes a leading "http://", "https://", "http://www." or "https://www." from a URL.
    /// </summary>
    /// <remarks>
    /// Only a prefix at position 0 is removed (case-sensitive); scheme text that occurs later in the
    /// URL, for example inside a query-string parameter, is left untouched.
    /// </remarks>
    /// <param name="url">URL to normalize.</param>
    /// <returns>The URL without the matched prefix, or the unchanged URL when none matches.</returns>
    private static string StripUrlPrefix(string url)
    {
        // The "www." variants come first so that they win over the bare scheme.
        string[] prefixes = { "http://www.", "https://www.", "http://", "https://" };
        foreach (string prefix in prefixes)
        {
            if (url.StartsWith(prefix, StringComparison.Ordinal))
            {
                return url.Substring(prefix.Length);
            }
        }
        return url;
    }

    /// <summary>
    /// Turns a ';'-separated n-gram string into entities and appends the new ones to <paramref name="entityList"/>.
    /// </summary>
    /// <param name="ngrams">';'-separated tokens; tokens shorter than three characters are ignored.</param>
    /// <param name="source">Value stored in the <c>entitySource</c> member of every created entity.</param>
    /// <param name="targeturl">Normalized link, passed to <c>IsEntityExtractedFromDomain</c>.</param>
    /// <param name="alreadyPrinted">Lower-cased keys collected so far; extended with every accepted token.</param>
    /// <param name="entityList">List that receives the created entities.</param>
    /// <param name="previousEntity">Last token accepted before this call.</param>
    /// <returns>The last accepted token, or <paramref name="previousEntity"/> when no token was accepted.</returns>
    private string AppendNgramEntities(string ngrams, string source, string targeturl, HashSet<string> alreadyPrinted, List<Entity> entityList, string previousEntity)
    {
        foreach (string token in ngrams.Trim().Split(';'))
        {
            if (token.Length < 3)
            {
                continue;
            }

            string ngramType;
            if (token[0] == '#')
            {
                ngramType = "hashtag";
            }
            else if (token[0] == '@' || ("@" + token) == previousEntity)
            {
                // A bare name that directly follows its own "@name" token is still a username.
                ngramType = "username";
            }
            else
            {
                ngramType = "ngram";
            }

            string key = token.ToLower();
            if (alreadyPrinted.Contains(key))
            {
                continue;
            }

            if (IsEntityExtractedFromDomain(targeturl, token))
            {
                continue;
            }

            // Only accepted tokens update the "previous" marker and the de-duplication set.
            previousEntity = token;
            alreadyPrinted.Add(key);

            entityList.Add(new Entity()
            {
                entity = key, entityPrintable = token, entitytype = ngramType, entitysubtypetype = string.Empty, entitySource = source
            });
        }

        return previousEntity;
    }