Re-implement textmodel_svmlim() without dependencies
1 |
#include "lib.h" |
|
2 |
#include <math.h> |
|
3 |
using namespace Rcpp; |
|
4 |
|
|
5 |
// [[Rcpp::export]]
|
|
6 | 1 |
Rcpp::List qatd_cpp_wordfish_dense(SEXP wfm, SEXP dir, SEXP priors, SEXP tol, SEXP disp, SEXP dispfloor, bool abs_err){ |
7 |
|
|
8 |
// DEFINE INPUTS
|
|
9 |
|
|
10 | 1 |
Rcpp::NumericMatrix Y(wfm); |
11 | 1 |
Rcpp::NumericVector priorvec(priors); |
12 | 1 |
Rcpp::NumericVector tolvec(tol); |
13 | 1 |
Rcpp::IntegerVector dirvec(dir); |
14 | 1 |
Rcpp::IntegerVector disptype(disp); |
15 | 1 |
Rcpp::NumericVector dispmin(dispfloor); |
16 |
|
|
17 | 1 |
double priorprecalpha = priorvec(0); |
18 | 1 |
double priorprecpsi = priorvec(1); |
19 | 1 |
double priorprecbeta = priorvec(2); |
20 | 1 |
double priorprectheta = priorvec(3); |
21 |
|
|
22 | 1 |
int N = Y.nrow(); |
23 | 1 |
int K = Y.ncol(); |
24 |
|
|
25 |
// SET INITIAL VALUES
|
|
26 | 1 |
Rcpp::NumericVector alpha(N); |
27 | 1 |
Rcpp::NumericVector psi(K); |
28 | 1 |
Rcpp::NumericVector beta(K); |
29 | 1 |
Rcpp::NumericVector theta(N); |
30 |
|
|
31 | 1 |
Rcpp::NumericVector thetaSE(N); // document position standard errors |
32 | 1 |
Rcpp::NumericVector phi(K,1.0); // word-level dispersion parameters |
33 |
|
|
34 |
// Construct Chi-Sq Residuals
|
|
35 | 1 |
arma::mat C(Y.begin(), N, K); |
36 | 1 |
arma::colvec rsum = sum(C,1); |
37 | 1 |
arma::rowvec csum = sum(C,0); |
38 | 1 |
double asum = sum(rsum); |
39 | 1 |
for (int i = 0; i < N; i++){ |
40 | 1 |
for (int k=0; k < K; k++){ |
41 | 1 |
C(i,k) = (Y(i,k) - rsum(i)*csum(k)/asum)/sqrt(rsum(i)*csum(k)/asum); |
42 |
}
|
|
43 |
}
|
|
44 |
|
|
45 |
// Singular Value Decomposition of Chi-Sq Residuals
|
|
46 | 1 |
arma::mat U(N,N); |
47 | 1 |
arma::vec s(N); |
48 | 1 |
arma::mat V(K,N); |
49 | 1 |
svd(U,s,V,C); |
50 |
|
|
51 |
// Load initial values
|
|
52 | 1 |
for (int i = 0; i < N; i++) { |
53 | 1 |
theta(i) = pow(rsum(i) / asum,-0.5) * U(i,0); |
54 |
//Rcout<<"theta starting values:"<<theta(i)<<std::endl;
|
|
55 |
}
|
|
56 | 1 |
for (int k=0; k < K; k++) beta(k) = 0; // pow(csum(k)/asum,-0.5) * V(k,0); |
57 | 1 |
for (int i=0; i < N; i++) alpha = log(rsum); |
58 | 1 |
psi = log(csum/N); |
59 |
|
|
60 | 1 |
alpha = alpha - log(mean(rsum)); |
61 | 1 |
theta = (theta - mean(theta)) / sd(theta); |
62 |
|
|
63 |
// Create temporary variables
|
|
64 | 1 |
Rcpp::NumericMatrix pars(2,1); |
65 | 1 |
Rcpp::NumericMatrix newpars(2,1); |
66 | 1 |
Rcpp::NumericMatrix G(2,1); |
67 | 1 |
Rcpp::NumericMatrix H(2,2); |
68 |
double loglambdaik; |
|
69 |
double mutmp; |
|
70 |
double phitmp; |
|
71 | 1 |
Rcpp::NumericVector lambdai(K); |
72 | 1 |
Rcpp::NumericVector lambdak(N); |
73 | 1 |
double stepsize = 1.0; |
74 | 1 |
double cc = 0.0; |
75 | 1 |
int inneriter = 0; |
76 | 1 |
int outeriter = 0; |
77 |
|
|
78 | 1 |
double lastlp = -2000000000000.0; |
79 | 1 |
double lp = -1.0 * (sum(0.5 * ((alpha*alpha) * (priorprecalpha))) + |
80 | 1 |
sum(0.5 * ((psi * psi) * (priorprecpsi))) + |
81 | 1 |
sum(0.5 * ((beta*beta)*(priorprecbeta))) + |
82 | 1 |
sum(0.5 * ((theta*theta)*(priorprectheta)))); |
83 |
|
|
84 | 1 |
for (int i = 0; i < N; i++){ |
85 | 1 |
for (int k=0; k < K; k++){ |
86 | 1 |
loglambdaik = alpha(i) + psi(k) + beta(k) * theta(i); |
87 | 1 |
lp = lp + loglambdaik * Y(i,k) - exp(loglambdaik); |
88 |
}
|
|
89 |
}
|
|
90 |
|
|
91 |
// BEGIN WHILE LOOP
|
|
92 | 1 |
double err = (abs_err == true) ? fabs(lp - lastlp) : (lp - lastlp); |
93 | 1 |
while ((err > tolvec(0)) && outeriter < 100) { |
94 | 1 |
outeriter++; |
95 |
|
|
96 |
// UPDATE WORD PARAMETERS
|
|
97 | 1 |
for (int k = 0; k < K; k++) { |
98 | 1 |
cc = 1; |
99 | 1 |
inneriter = 0; |
100 | 1 |
if (outeriter == 1) stepsize = 0.5; |
101 | 1 |
while ((cc > tolvec(1)) && inneriter < 10){ |
102 | 1 |
inneriter++; |
103 | 1 |
lambdak = exp(alpha + psi(k) + beta(k) * theta); |
104 | 1 |
G(0,0) = sum(Y(_,k) - lambdak) / phi(k) - psi(k) * (priorprecpsi); |
105 | 1 |
G(1,0) = sum(theta * (Y(_,k) - lambdak)) / phi(k) - beta(k) * (priorprecbeta); |
106 | 1 |
H(0,0) = -sum(lambdak) / phi(k) - priorprecpsi; |
107 | 1 |
H(1,0) = -sum(theta * lambdak)/phi(k); |
108 | 1 |
H(0,1) = H(1,0); |
109 | 1 |
H(1,1) = -sum((theta * theta) * lambdak) / phi(k) - priorprecbeta; |
110 | 1 |
pars(0,0) = psi(k); |
111 | 1 |
pars(1,0) = beta(k); |
112 | 1 |
newpars(0,0) = pars(0,0) - stepsize * (H(1,1) * G(0,0) - H(0,1) * G(1,0)) / (H(0,0) * H(1,1) - H(0,1) * H(1,0)); |
113 | 1 |
newpars(1,0) = pars(1,0) - stepsize * (H(0,0) * G(1,0) - H(1,0) * G(0,0)) / (H(0,0) * H(1,1) - H(0,1) * H(1,0)); |
114 | 1 |
psi(k) = newpars(0,0); |
115 | 1 |
beta(k) = newpars(1,0); |
116 | 1 |
cc = max(abs(newpars - pars)); |
117 | 1 |
stepsize = 1.0; |
118 |
}
|
|
119 |
}
|
|
120 |
|
|
121 |
|
|
122 |
// UPDATE DOCUMENT PARAMETERS
|
|
123 | 1 |
for (int i = 0; i < N; i++){ |
124 | 1 |
cc = 1; |
125 | 1 |
inneriter = 0; |
126 | 1 |
if (outeriter == 1) stepsize = 0.5; |
127 | 1 |
while ((cc > tolvec(1)) && inneriter < 10){ |
128 | 1 |
inneriter++; |
129 | 1 |
lambdai = exp(alpha(i) + psi + beta * theta(i)); |
130 | 1 |
G(0,0) = sum((Y(i,_) - lambdai) / phi) - alpha(i) * priorprecalpha; |
131 | 1 |
G(1,0) = sum((beta * (Y(i,_) - lambdai)) / phi) - theta(i) * priorprectheta; |
132 | 1 |
H(0,0) = -sum(lambdai/phi) - priorprecalpha; |
133 | 1 |
H(1,0) = -sum((beta * lambdai) / phi); |
134 | 1 |
H(0,1) = H(1,0); |
135 | 1 |
H(1,1) = -sum(((beta * beta) * lambdai) / phi) - priorprectheta; |
136 | 1 |
pars(0,0) = alpha(i); |
137 | 1 |
pars(1,0) = theta(i); |
138 | 1 |
newpars(0,0) = pars(0,0) - stepsize * (H(1,1) * G(0,0) - H(0,1) * G(1,0)) / (H(0,0) * H(1,1) - H(0,1) * H(1,0)); |
139 | 1 |
newpars(1,0) = pars(1,0) - stepsize * (H(0,0) * G(1,0) - H(1,0) * G(0,0)) / (H(0,0) * H(1,1) - H(0,1) * H(1,0)); |
140 | 1 |
alpha(i) = newpars(0,0); |
141 | 1 |
theta(i) = newpars(1,0); |
142 | 1 |
cc = max(abs(newpars - pars)); |
143 | 1 |
stepsize = 1.0; |
144 |
}
|
|
145 |
}
|
|
146 |
|
|
147 |
// UPDATE DISPERSION PARAMETERS
|
|
148 |
|
|
149 | 1 |
if (disptype(0) == 2) { // single dispersion parameter for all words |
150 | 1 |
phitmp = 0.0; |
151 | 1 |
for (int k = 0; k < K; k++){ |
152 | 1 |
for (int i=0; i < N; i++){ |
153 | 1 |
mutmp = exp(alpha(i) + psi(k) + beta(k) * theta(i)); |
154 | 1 |
phitmp = phitmp + (Y(i,k) - mutmp) * (Y(i,k) - mutmp) / mutmp; |
155 |
}
|
|
156 |
}
|
|
157 | 1 |
phitmp = phitmp / (N * K - 2 * N - 2 * K); |
158 | 1 |
for (int k = 0; k < K; k++) phi(k) = phitmp; |
159 |
}
|
|
160 |
|
|
161 | 1 |
if (disptype(0) >= 3) { // individual dispersion parameter for each word |
162 | 1 |
for (int k = 0; k < K; k++){ |
163 | 1 |
phitmp = 0.0; |
164 | 1 |
for (int i = 0; i < N; i++){ |
165 | 1 |
mutmp = exp(alpha(i) + psi(k) + beta(k) * theta(i)); |
166 | 1 |
phitmp = phitmp + (Y(i,k) - mutmp) * (Y(i,k) - mutmp) / mutmp; |
167 |
}
|
|
168 | 1 |
phitmp = (K * phitmp) / (N * K - 2 * N - 2 * K); |
169 | 1 |
phi(k) = phitmp; |
170 |
// set ceiling on underdispersion
|
|
171 | 1 |
if (disptype(0) == 4) phi(k) = fmax(dispmin(0), phi(k)); |
172 |
}
|
|
173 |
}
|
|
174 |
|
|
175 | 1 |
alpha = alpha - mean(alpha); |
176 | 1 |
theta = (theta - mean(theta))/sd(theta); |
177 |
|
|
178 |
// CHECK LOG-POSTERIOR FOR CONVERGENCE
|
|
179 | 1 |
lastlp = lp; |
180 | 1 |
lp = -1.0 * (sum(0.5 * ((alpha*alpha) * (priorprecalpha))) + sum(0.5 * ((psi*psi) * (priorprecpsi))) + sum(0.5 * ((beta*beta)*(priorprecbeta))) + sum(0.5 * ((theta*theta)*(priorprectheta)))); |
181 | 1 |
for (int i = 0; i < N; i++){ |
182 | 1 |
for (int k = 0; k < K; k++){ |
183 | 1 |
loglambdaik = alpha(i) + psi(k) + beta(k) * theta(i); |
184 | 1 |
lp = lp + loglambdaik * Y(i,k) - exp(loglambdaik); |
185 |
}
|
|
186 |
}
|
|
187 |
// Rprintf("%d: %f2\\n",outeriter,lp);
|
|
188 |
//Rcout<<"outeriter="<<outeriter<<" lp - lastlp= "<<lp - lastlp<<std::endl;
|
|
189 | 1 |
err = (abs_err == true) ? fabs(lp - lastlp) : (lp - lastlp); |
190 |
// END WHILE LOOP
|
|
191 |
}
|
|
192 |
|
|
193 |
|
|
194 |
// Fix Global Polarity
|
|
195 |
|
|
196 |
// added the -1 because C counts from ZERO... -- KB
|
|
197 | 1 |
if (theta(dirvec(0) - 1) > theta(dirvec(1) - 1)) { |
198 | 1 |
beta = -beta; |
199 | 1 |
theta = -theta; |
200 |
}
|
|
201 |
|
|
202 |
// COMPUTE DOCUMENT STANDARD ERRORS
|
|
203 | 1 |
for (int i = 0; i < N; i++) { |
204 | 1 |
lambdai = exp(alpha(i) + psi + beta*theta(i)); |
205 | 1 |
H(0,0) = -sum(lambdai / phi) - priorprecalpha; |
206 | 1 |
H(1,0) = -sum((beta * lambdai) / phi); |
207 | 1 |
H(0,1) = H(1,0); |
208 | 1 |
H(1,1) = -sum(((beta* beta) * lambdai)/phi) - priorprectheta; |
209 | 1 |
thetaSE(i) = sqrt(-1.0 * H(0,0) / (H(0,0) * H(1,1) - H(1,0) * H(0,1))); |
210 |
}
|
|
211 |
|
|
212 |
// DEFINE OUTPUT
|
|
213 |
|
|
214 | 1 |
return Rcpp::List::create(Rcpp::Named("theta") = theta, |
215 | 1 |
Rcpp::Named("alpha") = alpha, |
216 | 1 |
Rcpp::Named("psi") = psi, |
217 | 1 |
Rcpp::Named("beta") = beta, |
218 | 1 |
Rcpp::Named("phi") = phi, |
219 | 1 |
Rcpp::Named("thetaSE") = thetaSE); |
220 |
|
|
221 |
}
|
Read our documentation on viewing source code .