/// <summary>
/// Builds a character-trigram vocabulary from the Billboard corpus and then appends one
/// binary feature vector per usable row to the output file.
/// </summary>
/// <remarks>
/// The input file is read twice. The first pass collects every distinct trigram into a set
/// (the vocabulary). The second pass writes, for each usable row, one '0'/'1' digit per
/// vocabulary entry (in the set's enumeration order), followed by a space and the value of
/// column 6.
/// A row is usable when it has at least 7 comma-separated columns, column 0 is not "rank",
/// column 4 is neither empty nor "NA", and column 6 is not empty.
/// </remarks>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Input/output locations.
    string path = @"./data/";
    string inputData = "corpusBillboard.csv";
    string exitData = "corpusBillboardCaracterizado.csv";

    HashSet<string> types = new HashSet<string>();
    leerStopWords();

    // Pass 1: build the vocabulary of trigrams.
    using (StreamReader sr = new StreamReader(path + inputData))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            string[] lineArray = line.Split(',');

            // Discard short rows, the "rank" row (presumably the CSV header), blank or "NA"
            // lyrics and blank labels. The original chained "!=" tests with "|", which is
            // always true, so nothing was ever discarded.
            if (lineArray.Length < 7
                || lineArray[0] == "rank"
                || lineArray[4] == ""
                || lineArray[4] == "NA"
                || lineArray[6] == "")
            {
                continue;
            }

            string letra = lineArray[4];
            letra = quitarStopWords(letra); // remove stop words
            letra = letra.ToLower();        // lower-case
            var ngrams = NGramGenerator.generate(letra.ToArray(), 3, ""); // build trigrams
            dynamic ngramsArray = ngrams.toArray();
            foreach (var token in ngramsArray)
            {
                types.Add(token);
            }
        }
    }

    // Pass 2: characterize every usable row against the vocabulary.
    // The writer is opened once (still in append mode, as before) instead of once per row.
    // NOTE(review): append mode means a re-run adds rows to an existing output file — confirm
    // that this is intended.
    using (StreamReader sr = new StreamReader(path + inputData))
    using (TextWriter tw = new StreamWriter(path + exitData, true))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            string[] lineArray = line.Split(',');

            // Same row filter as in pass 1, so both passes see the same rows.
            if (lineArray.Length < 7
                || lineArray[0] == "rank"
                || lineArray[4] == ""
                || lineArray[4] == "NA"
                || lineArray[6] == "")
            {
                continue;
            }

            string letra = lineArray[4];
            letra = quitarStopWords(letra); // remove stop words
            letra = letra.ToLower();        // lower-case
            var ngrams = NGramGenerator.generate(letra.ToArray(), 3, ""); // build trigrams
            dynamic ngramsArray = ngrams.toArray();

            // Collect this row's trigrams in a set so each vocabulary lookup is O(1)
            // instead of a scan over every trigram of the row.
            HashSet<string> present = new HashSet<string>();
            foreach (var token in ngramsArray)
            {
                present.Add(token);
            }

            // One digit per vocabulary entry: 1 if the row contains that trigram, else 0.
            foreach (string item in types)
            {
                tw.Write(present.Contains(item) ? 1 : 0);
            }

            tw.Write(" " + lineArray[6]);
            tw.WriteLine();
        }
    }
}
/// <summary>
/// Builds a character-trigram vocabulary from the tweets corpus and then appends one
/// binary feature vector per usable row to the output file.
/// </summary>
/// <remarks>
/// The input file is read twice. The first pass collects every distinct trigram into a set
/// (the vocabulary). The second pass writes, for each usable row, one '0'/'1' digit per
/// vocabulary entry (in the set's enumeration order), followed by a space and the value of
/// column 1.
/// A row is usable when it has at least 3 comma-separated columns, column 1 is neither
/// empty nor "3", and column 2 is not empty.
/// </remarks>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Input/output locations.
    string path = @"./data/";
    string inputData = "corpusTweets.csv";
    string exitData = "corpusTweetsCaracterizado.csv";

    HashSet<string> types = new HashSet<string>();
    leerStopWords();

    // Pass 1: build the vocabulary of trigrams.
    using (StreamReader sr = new StreamReader(path + inputData))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            string[] lineArray = line.Split(',');

            // Discard short rows, type-3 tweets and rows with a blank column 1 or 2.
            if (lineArray.Length < 3
                || lineArray[1] == "3"
                || lineArray[1] == ""
                || lineArray[2] == "")
            {
                continue;
            }

            string tweet = lineArray[0];
            // Restore commas. Strings are immutable, so the result must be assigned;
            // the original call discarded it and the '|' characters were left in place.
            tweet = tweet.Replace("|", ",");
            tweet = quitarStopWords(tweet); // remove stop words
            tweet = tweet.ToLower();        // lower-case
            var ngrams = NGramGenerator.generate(tweet.ToArray(), 3, ""); // build trigrams
            dynamic ngramsArray = ngrams.toArray();
            foreach (var token in ngramsArray)
            {
                types.Add(token);
            }
        }
    }

    // Pass 2: characterize every usable tweet against the vocabulary.
    // The writer is opened once (still in append mode, as before) instead of once per row.
    // NOTE(review): append mode means a re-run adds rows to an existing output file — confirm
    // that this is intended.
    using (StreamReader sr = new StreamReader(path + inputData))
    using (TextWriter tw = new StreamWriter(path + exitData, true))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            string[] lineArray = line.Split(',');

            // Same row filter as in pass 1, so both passes see the same rows.
            if (lineArray.Length < 3
                || lineArray[1] == "3"
                || lineArray[1] == ""
                || lineArray[2] == "")
            {
                continue;
            }

            string tweet = lineArray[0];
            tweet = tweet.Replace("|", ","); // restore commas (result must be assigned)
            tweet = quitarStopWords(tweet);  // remove stop words
            tweet = tweet.ToLower();         // lower-case
            var ngrams = NGramGenerator.generate(tweet.ToArray(), 3, ""); // build trigrams
            dynamic ngramsArray = ngrams.toArray();

            // Collect this tweet's trigrams in a set so each vocabulary lookup is O(1)
            // instead of a scan over every trigram of the tweet.
            HashSet<string> present = new HashSet<string>();
            foreach (var token in ngramsArray)
            {
                present.Add(token);
            }

            // One digit per vocabulary entry: 1 if the tweet contains that trigram, else 0.
            foreach (string item in types)
            {
                tw.Write(present.Contains(item) ? 1 : 0);
            }

            // NOTE(review): the original also derived a numeric "target" from column 2
            // (amlo=0, anaya=1, meade=2, bronco=3, debate=4, anything else=5) but never
            // used it; the label actually written is column 1. The dead computation was
            // removed — confirm which column was meant to be the label.
            tw.Write(" " + lineArray[1]);
            tw.WriteLine();
        }
    }
}