lucasgodeiro / TextForecast

@@ -1,75 +1,3 @@
Loading
1 -
############Forecasting S&P return based on news
2 -
# ticker = stock ticker
3 -
# start_date= start of sample
4 -
# end_date= end of sample
5 -
# start_year= star of news year
6 -
#  end_year = end of news year
7 -
# quant_fin=  initial quantile
8 -
# quant_end = last quantile
9 -
# by_quant= by quantile
10 -
# newstype if 100 news 0, if 10 news 1
11 -
#library(udpipe)
12 -
#library(tseries)
13 -
#library(fGarch)
14 -
#library(caret)
15 -
#library(zoo)
16 -
#library(aod)
17 -
#library(EnvStats)
18 -
#library(stats)
19 -
#library(forecast)
20 -
#library(vars)
21 -
#library(tsDyn)
22 -
#library(dplyr)
23 -
#library(quantreg)
24 -
#library(lars)
25 -
#library(lsa)
26 -
#library(pdftools)
27 -
#library(plyr)
28 -
#library(class)
29 -
#library(tm)
30 -
#library(wordcloud)
31 -
#library(RWeka)
32 -
#library(SnowballC)
33 -
#library(caret)
34 -
#library(rminer)
35 -
#library(kernlab)
36 -
#library(rpart)
37 -
#library(quanteda)
38 -
#library(tidyr)
39 -
#library(text2vec)
40 -
#library(glmnet)
41 -
#library(ggplot2)
42 -
#library(tau)
43 -
#library(dplyr)
44 -
#library(tidytext)
45 -
#library(janeaustenr)
46 -
#library(tidyr)
47 -
#library(dplyr)
48 -
49 -
50 -
51 -
52 -
#dates:  a character vector indicating the subfolders dates.  ##############
53 -
54 -
#ntrms: maximum numbers of words  that will be filtered by tf-idf. We rank the word
55 -
# by tf-idf in a decreasing order. Then, after we select the words with the ntrms highst tf-idf.
56 -
57 -
#st: set 0 to stem the words and 1 otherwise.
58 -
59 -
#path_name: the folders path where the subfolders with the dates are located.
60 -
61 -
62 -
63 -
################Example ##############################################################
64 -
#st_year=2000
65 -
#end_year=2017
66 -
#qt=paste0(sort(rep(seq(from=st_year,to=end_year,by=1),12)),c("m1","m2","m3","m4","m5","m6","m7","m8","m9","m10","m11","m12"))
67 -
#path_name="C:/Users/Lucas Godeiro/Dropbox/SP/NY_WSJ/Data_all"
68 -
#z_wrd1=get_words(corpus_dates=qt[1:t],path_name=path_name,ntrms=500,st=0)
69 -
################Example ##############################################################
70 -
71 -
#returns a list containing  a matrix with the all words couting and another with a td-idf filtered words couting according to the ntrms.
72 -
73 1
#' get_words function
74 2
#'
75 3
#' @param corpus_dates A vector of characters indicating the subfolders where are located the texts.
@@ -78,18 +6,14 @@
Loading
78 6
#' @param path_name the folders path where the subfolders with the dates are located.
79 7
#' @param language The texts language.
80 8
#'
81 -
#' @return a list containing  a matrix with the all words couting and another with a td-idf filtered words counting according to the ntrms.
9 +
#' @return a list containing  a sparse matrix with the all words couting and another with a td-idf filtered words counting according to the ntrms.
82 10
#' @import tm
83 -
#' @import wordcloud
84 -
#' @import SnowballC
85 -
#' @import rpart
86 -
#' @import tidyr
87 -
#' @import text2vec
88 -
#' @import tidytext
89 11
#' @import pdftools
90 12
#' @importFrom dplyr tbl_df
91 13
#' @importFrom plyr rbind.fill
92 14
#' @importFrom stats aggregate
15 +
#' @importFrom Matrix Matrix
16 +
#'
93 17
#' @export
94 18
#'
95 19
#' @examples
@@ -120,35 +44,35 @@
Loading
120 44
121 45
  if(st==0) {
122 46
    cleancorpus <- function(corpus) {
123 -
      corpus.tmp <-  tm_map(corpus,removePunctuation)
124 -
      corpus.tmp1 <- tm_map(corpus.tmp,content_transformer(tolower))
125 -
      corpus.tmp2 <- tm_map(corpus.tmp1,stripWhitespace)
126 -
      corpus.tmp3 <- tm_map(corpus.tmp2,removeNumbers)
127 -
      corpus.tmp4 <- tm_map(corpus.tmp3,removeWords,c(tidytext::stop_words$word))
128 -
      corpus.tmp5 <- tm_map(corpus.tmp4,removeWords,stopwords(language))
129 -
      corpus.tmp6 <- tm_map(corpus.tmp5, stemDocument, language = language)
47 +
      corpus.tmp <-  tm::tm_map(corpus,removePunctuation)
48 +
      corpus.tmp1 <- tm::tm_map(corpus.tmp,content_transformer(tolower))
49 +
      corpus.tmp2 <- tm::tm_map(corpus.tmp1,stripWhitespace)
50 +
      corpus.tmp3 <- tm::tm_map(corpus.tmp2,removeNumbers)
51 +
      corpus.tmp4 <- tm::tm_map(corpus.tmp3,removeWords,c(tidytext::stop_words$word))
52 +
      corpus.tmp5 <- tm::tm_map(corpus.tmp4,removeWords,stopwords(language))
53 +
      corpus.tmp6 <- tm::tm_map(corpus.tmp5, stemDocument, language = language)
130 54
      return(corpus.tmp6)
131 55
    }
132 56
  } else {
133 57
    cleancorpus <- function(corpus) {
134 -
      corpus.tmp <-  tm_map(corpus,removePunctuation)
135 -
      corpus.tmp1 <- tm_map(corpus.tmp,content_transformer(tolower))
136 -
      corpus.tmp2 <- tm_map(corpus.tmp1,stripWhitespace)
137 -
      corpus.tmp3 <- tm_map(corpus.tmp2,removeNumbers)
138 -
      corpus.tmp4 <- tm_map(corpus.tmp3,removeWords,c(stop_words$word))
58 +
      corpus.tmp <-  tm::tm_map(corpus,removePunctuation)
59 +
      corpus.tmp1 <- tm::tm_map(corpus.tmp,content_transformer(tolower))
60 +
      corpus.tmp2 <- tm::tm_map(corpus.tmp1,stripWhitespace)
61 +
      corpus.tmp3 <- tm::tm_map(corpus.tmp2,removeNumbers)
62 +
      corpus.tmp4 <- tm::tm_map(corpus.tmp3,removeWords,c(stop_words$word))
139 63
      corpus.tmp5 <- tm_map(corpus.tmp4,removeWords,stopwords("english"))
140 64
      return(corpus.tmp5)
141 65
    }
142 66
  }
143 67
144 -
  Rpdf <- readPDF(control = list(text = "-layout"))
68 +
  Rpdf <- tm::readPDF(control = list(text = "-layout"))
145 69
146 70
147 71
  generateTDM <- function(cand,path) {
148 72
    s.dir <- sprintf("%s/%s",path,cand)
149 -
    s.cor <- Corpus(DirSource(directory=s.dir,encoding = "UTF-8"), readerControl=list(reader = Rpdf) )
73 +
    s.cor <- tm::Corpus(DirSource(directory=s.dir,encoding = "UTF-8"), readerControl=list(reader = Rpdf) )
150 74
    s.cor.cl <- cleancorpus(s.cor)
151 -
    s.tdm <- TermDocumentMatrix(s.cor.cl)
75 +
    s.tdm <- tm::TermDocumentMatrix(s.cor.cl)
152 76
    #s.tdm <- removeSparseTerms(s.tdm,spar)
153 77
    #s.tdm=as.matrix(s.tdm)
154 78
    result <- list(name=cand,tdm=s.tdm)
@@ -184,61 +108,61 @@
Loading
184 108
185 109
186 110
  tfidf<- function(x) {
187 -
    xx=as.matrix(x)
188 -
    ndoc=nrow(x)
111 +
    xx <- as.matrix(x)
112 +
    ndoc <- nrow(x)
189 113
190 114
    II = xx>0
191 -
    II_sum = apply(II,2,FUN=sum)
192 -
    nct=II_sum
193 -
    idf=log(ndoc/nct)
115 +
    II_sum <- apply(II,2,FUN=sum)
116 +
    nct <- II_sum
117 +
    idf <- log(ndoc/nct)
194 118
195 -
    xx_tfidf = matrix(NA,nrow=nrow(x),ncol=ncol(x))
119 +
    xx_tfidf <- matrix(NA,nrow=nrow(x),ncol=ncol(x))
196 120
    #xx_log=log(1+xx)
197 121
    for (i in 1:ncol(x)) {
198 122
      xx_tfidf[,i] =idf[i]  * xx[,i]
199 123
    }
200 -
    xx_tfidf_sum = apply(xx_tfidf,2,FUN=mean)
124 +
    xx_tfidf_sum <- apply(xx_tfidf,2,FUN=mean)
201 125
    return(xx_tfidf)
202 126
  }
203 127
204 128
205 129
  tfidfsum<- function(x) {
206 -
    xx=as.matrix(x)
207 -
    ndoc=nrow(x)
130 +
    xx <- as.matrix(x)
131 +
    ndoc <- nrow(x)
208 132
209 -
    II = xx>0
210 -
    II_sum = apply(II,2,FUN=sum)
211 -
    nct=II_sum
212 -
    idf=log(ndoc/nct)
133 +
    II <- xx>0
134 +
    II_sum <- apply(II,2,FUN=sum)
135 +
    nct <- II_sum
136 +
    idf <- log(ndoc/nct)
213 137
214 138
    #xx_log=log(1+xx)
215 139
216 -
    xx_tfidf = matrix(NA,nrow=nrow(x),ncol=ncol(x))
140 +
    xx_tfidf <- matrix(NA,nrow=nrow(x),ncol=ncol(x))
217 141
    #xx_log=log(1+xx)
218 142
    for (i in 1:ncol(x)) {
219 -
      xx_tfidf[,i] =idf[i]*xx[,i]
143 +
      xx_tfidf[,i] <- idf[i]*xx[,i]
220 144
    }
221 145
222 -
    xx_tfidf_sum = apply(xx_tfidf,2,FUN=mean)
146 +
    xx_tfidf_sum <- apply(xx_tfidf,2,FUN=mean)
223 147
    xx_tfidf_sum[is.nan(xx_tfidf_sum)]=0
224 148
    return(xx_tfidf_sum)
225 149
  }
226 150
227 151
228 152
  tfidfsum1<- function(x) {
229 -
    xx=as.matrix(x)
230 -
    ndoc=nrow(x)
153 +
    xx <- as.matrix(x)
154 +
    ndoc <- nrow(x)
231 155
232 -
    II = xx>0
233 -
    II_sum = apply(II,2,FUN=sum)
234 -
    nct=II_sum
235 -
    idf=log(ndoc/nct)
156 +
    II <- xx>0
157 +
    II_sum <- apply(II,2,FUN=sum)
158 +
    nct <- II_sum
159 +
    idf <- log(ndoc/nct)
236 160
237 161
238 162
239 -
    tf=apply(xx,2,FUN=sum)
163 +
    tf <- apply(xx,2,FUN=sum)
240 164
241 -
    xx_tfidf_sum=tf*idf
165 +
    xx_tfidf_sum <- tf*idf
242 166
    xx_tfidf_sum[is.nan(xx_tfidf_sum)]=0
243 167
244 168
    return(xx_tfidf_sum)
@@ -253,8 +177,11 @@
Loading
253 177
    II=m_data>=meanfilter
254 178
    data_words1=as.matrix(data_words[,II])
255 179
  } else {
256 -
    data_words1=as.matrix(data_words)
180 +
    data_words1 <- as.matrix(data_words)
257 181
  }
182 +
183 +
  data_words <-  Matrix::Matrix(data_words, sparse = TRUE)
184 +
  data_words1 <-  Matrix::Matrix(data_words1, sparse = TRUE)
258 185
  data_lst = list(data_words,data_words1)
259 186
  return(data_lst)
260 187
}

@@ -13,7 +13,6 @@
Loading
13 13
#' @import ggplot2
14 14
#' @import tidytext
15 15
#' @import forcats
16 -
#' @import tidytext
17 16
#' @importFrom dplyr tibble
18 17
#' @importFrom dplyr filter
19 18
#' @importFrom dplyr group_by
@@ -26,6 +25,7 @@
Loading
26 25
#'
27 26
#'
28 27
#' @examples
28 +
#' suppressWarnings(RNGversion("3.5.0"))
29 29
#' set.seed(1)
30 30
#' data("stock_data")
31 31
#' data("news_data")
@@ -34,7 +34,7 @@
Loading
34 34
#' data("news_data")
35 35
#' X=news_data[,2:ncol(news_data)]
36 36
#' x=as.matrix(X)
37 -
#' grid_alphas=0.25
37 +
#' grid_alphas=0.05
38 38
#' cont_folds=TRUE
39 39
#' t=length(y)
40 40
#' optimal_alphas=optimal_alphas(x[1:(t-1),],w[1:(t-1),],

@@ -7,19 +7,15 @@
Loading
7 7
#' @param min_freq integer indicating the frequency of how many times a collocation should at least occur in the data in order to be returned.
8 8
#' @param language the texts language. Default is english.
9 9
#'
10 -
#' @return a list containing  a matrix with the all collocations couting and another with a tf-idf filtered collocations counting according to the ntrms.
10 +
#' @return a list containing  a sparse matrix with the all collocations couting and another with a tf-idf filtered collocations counting according to the ntrms.
11 11
#' @import udpipe
12 12
#' @import tm
13 13
#' @import pdftools
14 -
#' @import SnowballC
15 -
#' @import rpart
16 14
#' @import tidytext
17 -
#' @import text2vec
18 -
#' @import class
19 -
#' @import rpart
20 15
#' @importFrom dplyr tbl_df
21 16
#' @importFrom plyr rbind.fill
22 17
#' @importFrom stats aggregate
18 +
#' @importFrom Matrix Matrix
23 19
#' @export
24 20
#'
25 21
#' @examples
@@ -96,8 +92,8 @@
Loading
96 92
    s.tdm_m=base::as.matrix(s.tdm)
97 93
    s.tdm_sum=apply(s.tdm_m,1,FUN = sum)
98 94
    s.tdm.sum_rk=sort(s.tdm_sum,decreasing = TRUE)
99 -
    if(length(s.tdm_sum)>50){
100 -
      s=s.tdm.sum_rk[50]
95 +
    if(length(s.tdm_sum)>150){
96 +
      s=s.tdm.sum_rk[150]
101 97
    } else {
102 98
      s=0
103 99
    }
@@ -162,15 +158,19 @@
Loading
162 158
163 159
164 160
  if(ncol(data_words)>ntrms) {
165 -
    m_data=tfidfsum1(data_words)
166 -
    m_data_srt=sort(m_data,decreasing=TRUE)
167 -
    meanfilter=m_data_srt[ntrms]
161 +
    m_data <- tfidfsum1(data_words)
162 +
    m_data_srt <- sort(m_data,decreasing=TRUE)
163 +
    meanfilter <- m_data_srt[ntrms]
168 164
    II=m_data>=meanfilter
169 -
    data_words1=as.matrix(data_words[,II])
165 +
    data_words1 <- as.matrix(data_words[,II])
170 166
  } else {
171 -
    data_words1=as.matrix(data_words)
167 +
    data_words1 <- as.matrix(data_words)
172 168
  }
173 -
  data_lst = list(data_words,data_words1)
169 +
170 +
  data_words <-  Matrix::Matrix(data_words, sparse = TRUE)
171 +
  data_words1 <-  Matrix::Matrix(data_words1, sparse = TRUE)
172 +
  data_lst <- list(data_words,data_words1)
173 +
174 174
  return(data_lst)
175 175
176 176

@@ -1,4 +1,4 @@
Loading
1 -
#' Title
1 +
#' get_terms function
2 2
#'
3 3
#' @param corpus_dates a character vector indicating the subfolders where the texts are located.
4 4
#' @param ntrms_words maximum numbers of words  that will be filtered by tf-idf. We rank the word by tf-idf in a decreasing order. Then, we select the words with the ntrms highest tf-idf.
@@ -9,7 +9,7 @@
Loading
9 9
#' @param min_freq integer indicating the frequency of how many times a collocation should at least occur in the data in order to be returned.
10 10
#' @param language the texts language. Default is english.
11 11
#'
12 -
#' @return a list containing  a matrix with the all collocations and words couting and another with a td-idf filtered collocations and words counting according to the ntrms.
12 +
#' @return a list containing  a sparse matrix with the all collocations and words couting and another with a tf-idf filtered collocations and words counting according to the ntrms.
13 13
#' @export
14 14
#'
15 15
#' @examples
@@ -40,11 +40,11 @@
Loading
40 40
  z_wrd=get_words(corpus_dates=corpus_dates,ntrms=ntrms_words,st=st,path_name=path.name,language=language)
41 41
  z_coll=get_collocations(corpus_dates=corpus_dates,path_name=path.name,ntrms=ntrms_collocation,ngrams_number=ngrams_number,min_freq = min_freq,language=language)
42 42
43 -
  z_full=cbind(z_wrd[[1]],z_coll[[1]])
44 -
  z_full=as.matrix(z_full)
45 -
  z_tf=cbind(z_wrd[[2]],z_coll[[2]])
46 -
  z_tf=as.matrix(z_tf)
47 -
  results = list(z_full,z_tf)
43 +
  z_full <- cbind(z_wrd[[1]],z_coll[[1]])
44 +
  #z_full <- as.matrix(z_full)
45 +
  z_tf <- cbind(z_wrd[[2]],z_coll[[2]])
46 +
  #z_tf <- as.matrix(z_tf)
47 +
  results <- list(z_full,z_tf)
48 48
  return(results)
49 49
}
50 50

@@ -18,6 +18,7 @@
Loading
18 18
#' @export
19 19
#'
20 20
#' @examples
21 +
#' suppressWarnings(RNGversion("3.5.0"))
21 22
#' set.seed(1)
22 23
#' data("stock_data")
23 24
#' data("news_data")
@@ -26,12 +27,13 @@
Loading
26 27
#' data("news_data")
27 28
#' X=news_data[,2:ncol(news_data)]
28 29
#' x=as.matrix(X)
29 -
#' grid_alphas=0.15
30 +
#' grid_alphas=0.05
30 31
#' cont_folds=TRUE
31 32
#' t=length(y)
32 33
#' optimal_alphas=optimal_alphas(x=x[1:(t-1),],
33 34
#'                               y=y[2:t],grid_alphas=grid_alphas,cont_folds=TRUE,family="gaussian")
34 -
#' tv_idx=tv_sentiment_index_all_coefs(x=x[1:(t-1),],y=y[2:t],alpha = optimal_alphas[1],lambda = optimal_alphas[2],newx=x,
35 +
#' tv_idx=tv_sentiment_index_all_coefs(x=x[1:(t-1),],y=y[2:t],alpha = optimal_alphas[1],
36 +
#'                                  lambda = optimal_alphas[2],newx=x,
35 37
#'                                  scaled = TRUE,k_mov_avg = 4,type_mov_avg = "s")
36 38
37 39
tv_sentiment_index_all_coefs <- function(x,w,y, alpha, lambda,newx,family,scaled,k_mov_avg,type_mov_avg){

@@ -8,10 +8,11 @@
Loading
8 8
#' @param newx Matrix  that selection will applied. Useful for time series, when we need the observation at time t.
9 9
#' @param family the glmnet family.
10 10
#'
11 -
#' @return X_star: a list with the coefficients and a matrix with the most predictive terms.
11 +
#' @return X_star: a list with the coefficients and a sparse matrix with the most predictive terms.
12 12
#'
13 13
#' @import glmnet
14 14
#' @importFrom stats coef
15 +
#' @importFrom Matrix Matrix
15 16
#' @export
16 17
#'
17 18
#' @examples
@@ -95,8 +96,9 @@
Loading
95 96
96 97
97 98
  }
98 -
  coef_est=coef(eq)
99 -
  sx=as.matrix(subset(newx,select=II2))
100 -
  results=list(sx,coef_est)
99 +
  coef_est <- coef(eq)
100 +
  sx <-  as.matrix(subset(newx,select=II2))
101 +
  sx <-  Matrix::Matrix(sx, sparse = TRUE)
102 +
  results <- list(sx,coef_est)
101 103
  return(results)
102 104
}
Files Coverage
R 62.19%
Project Totals (13 files) 62.19%
Sunburst
The inner-most circle is the entire project, moving away from the center are folders then, finally, a single file. The size and color of each slice is representing the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project. Proceeding with folders and finally individual files. The size and color of each slice is representing the number of statements and the coverage, respectively.
Grid
Each block represents a single file in the project. The size and color of each block is represented by the number of statements and the coverage, respectively.
Loading