Scalable RESCAL Factorization

RDFTensor provides a scalable implementation of RESCAL tensor factorization that parallelizes the computation steps and uses a compact representation of the tensor slices. The following demonstration applies it to the UMLS dataset, which is represented as a 135 x 135 x 49 tensor.

Calculate RESCAL Factorization

    # Factorize the tensor slices in ntnsr$X with rank 10; the result's A and R
    # components are used in the following steps.
    tt <- rescal(
      ntnsr$X,
      rnk = 10,
      ainit = "nvecs",
      verbose = 1,
      lambdaA = 0,
      epsilon = 1e-4,
      lambdaR = 0
    )
## [1] "Initializing A"
## [1] "Calculating  eigen vectors..."
## [1] "initialize R and Z..."
## [1] "-----------------------------iteration: 1 ----------------------------------"
## [1] "[  1] fit: 0.59644 | delta: 6.0e-01 | secs: 0.32000"
## [1] "-----------------------------iteration: 2 ----------------------------------"
## [1] "[  2] fit: 0.63090 | delta: 3.4e-02 | secs: 0.28000"
## [1] "-----------------------------iteration: 3 ----------------------------------"
## [1] "[  3] fit: 0.63930 | delta: 8.4e-03 | secs: 0.29000"
## [1] "-----------------------------iteration: 4 ----------------------------------"
## [1] "[  4] fit: 0.64224 | delta: 2.9e-03 | secs: 0.30000"
## [1] "-----------------------------iteration: 5 ----------------------------------"
## [1] "[  5] fit: 0.64349 | delta: 1.3e-03 | secs: 0.28000"
## [1] "-----------------------------iteration: 6 ----------------------------------"
## [1] "[  6] fit: 0.64414 | delta: 6.4e-04 | secs: 0.27000"
## [1] "-----------------------------iteration: 7 ----------------------------------"
## [1] "[  7] fit: 0.64452 | delta: 3.8e-04 | secs: 0.31000"
## [1] "-----------------------------iteration: 8 ----------------------------------"
## [1] "[  8] fit: 0.64477 | delta: 2.5e-04 | secs: 0.30000"
## [1] "-----------------------------iteration: 9 ----------------------------------"
## [1] "[  9] fit: 0.64495 | delta: 1.7e-04 | secs: 0.29000"
## [1] "-----------------------------iteration: 10 ----------------------------------"
## [1] "[ 10] fit: 0.64506 | delta: 1.2e-04 | secs: 0.30000"
## [1] "-----------------------------iteration: 11 ----------------------------------"
## [1] "[ 11] fit: 0.64514 | delta: 7.8e-05 | secs: 0.28000"
#tt=scRescal(ntnsr$X,rnk=10,ainit='nvecs',verbose=1,lambdaA=0,epsilon=1e-4,lambdaR=0,ncores = 2,OS_WIN = TRUE)
    # Pull the two factors out of the factorization result for later use.
    A <- tt$A
    R <- tt$R

Calculate scores of triples

Use the function rescal_Trp_Val to calculate the scores of the triples in the graph, using the factorization obtained in the previous step.

      # Score the triples of ntnsr with the factors A and R, then plot the
      # distribution of the resulting 'val' column.
      res <- rescal_Trp_Val(R = R, A = A, ntnsr, verbose = 0)

      plot(
        density(res[, "val"]),
        main = "RESCAL Factorization rank=10, density of triples of UMLS"
      )

plot of chunk unnamed-chunk-3

    # Five-number summary (plus mean) of the triple scores.
    print(summary(res[, "val"]))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.01384  0.28338  0.73350  0.64514  0.98184  1.70056

Reconstruct tensor

    # Rebuild the tensor from the factors A and R, using the original tensor
    # ntnsr as reference; the result is read below as RecRes[[1]] and RecRes$TP.
    # NOTE(review): the scRescal call shown earlier spells this option `ncores`;
    # confirm `ncore` here is not relying on partial argument matching.
    RecRes <- RescalReconstructBack(
      R = R,
      A = A,
      otnsr = ntnsr,
      ncore = 2,
      verbose = 0,
      OS_WIN = TRUE,
      generateLog = TRUE
    )
## [1] "NEnt=135, ChkLen=1000, cntChnks=1, cntGrp=1, grpLen=10"

Calculate metrics

Calculate recall (true positive rate), precision, and their harmonic mean for one predicate slice over a range of score thresholds.

    # Percentage of the entries in RecRes$TP that are set.
    print(sprintf(
      "True positive rate:%.2f %%",
      100 * sum(RecRes$TP) / length(RecRes$TP)
    ))
## [1] "True positive rate:77.19 %"
    # Precision / recall / harmonic-mean curves for one predicate slice.
    s <- 2  # slice index; <affects> predicate
    ijk <- RecRes[[1]]$ijk   # index triples; column 2 is compared with the slice index
    val <- RecRes[[1]]$val   # score attached to each row of ijk
    tp_flg <- RecRes$TP      # per-row flag, counted below as true positives
    in_slice <- ijk[, 2] == s
    n_true <- sum(ntnsr$X[[s]])  # total of the original slice, used to derive fn

    # Candidate thresholds: every distinct score of a flagged triple in slice s,
    # from the strictest to the most permissive.
    thresholds <- sort(unique(val[tp_flg & in_slice]), decreasing = TRUE)
    if (length(thresholds) == 0) {
      stop("no true positives found for slice ", s, call. = FALSE)
    }

    # One row per threshold, bound together once instead of growing the matrix
    # inside a loop.
    stats <- do.call(rbind, lapply(thresholds, function(thr) {
      kept <- val >= thr & in_slice
      tp <- sum(tp_flg[kept])
      fp <- sum(kept) - tp
      fn <- n_true - tp
      c(thr = thr, R = tp / (tp + fn), P = tp / (tp + fp),
        tp = tp, fn = fn, fp = fp)
    }))
    HM <- 2 / (1 / stats[, "P"] + 1 / stats[, "R"])
    best_thr <- stats[which.max(HM), "thr"]  # threshold with the highest harmonic mean

    plot(stats[, "thr"], stats[, "R"] * 100, type = "l", col = "red", lwd = 2,
         main = sprintf("Slice:%d, Predicate:<%s>, #Triples:%d, Max HM @ %.4f",
                        s, ntnsr$P[s], n_true, best_thr),
         ylab = "", xlab = "Threshold ", cex.main = 0.85,
         # Span the largest threshold so that scores above 1 are not clipped.
         xlim = c(0, max(stats[, "thr"], 1)), ylim = c(0, 100))
    abline(h = c(0, 20, 40, 60, 80, 100), lty = 2, col = "grey")
    abline(v = seq(0.1, 1, 0.1), lty = 2, col = "grey")
    lines(stats[, "thr"], stats[, "P"] * 100, col = "blue", lwd = 2)
    lines(stats[, "thr"], 100 * HM, col = "green", lwd = 2)
    # grid(nx=10, lty = "dotted", lwd = 1)
    legend(legend = c("Recall", "Precision", "Harmonic mean"),
           col = c("red", "blue", "green"),
           x = 0.6, y = 20, pch = 1, cex = 0.75, lwd = 2)
    abline(v = best_thr, col = "grey")

plot of chunk unnamed-chunk-5