STATISTICA







STATISTICA BASIC Program Outlyind.stb

{ Check for multivariate outliers

Dr. Stefan Funke, Braunschweig, Germany (funke@ibm.net)
07/1997
(nothing is guaranteed - use the program at your own risk)
Distances are calculated relative to group means determined by the current global selection criterion.
Requires Statistica/W version 5.1 or higher
The variables to be included can be freely selected. In addition, a variable can be specified to store an indicator when an outlier is detected, i.e. p<0.05. The code is:0=no outlier, 1=outlier. For obvious reasons this variable must not be included in the list of variables to be screened. The indicator variable will not be changed if 'cancel' is selected in the second variable specification dialog. }

RandomAccess;
NoDataFileVariableNames; 
 
ReDim varlist(NVars),VarList2(Nvars); 
ReDim buffer(NCases); 
res:=SelectVariables1 ('Check for  multivariate outliers', 1, NVars, VarList, Count, 
'Select variables'); 
if (res=0) then exit; 
res2:=SelectVariables1 ('Select variable to store outlier indicator to', 1, 1, VarList2, 
Count2, 'Select variable'); 
ReDim sums(NVars),ns(NVars),means(NVars),vmisscnt(NVars),cmisscnt(NCases); 
ReDim missvar(count,count); 
 
for i:=1 to NVars do 
	begin 
		sums(i):=0; ns(i):=0; means(i):=0; vmisscnt(i):=0; 
	end; 
 
nselected:=0; 
for i:=1 to NCases do 
	begin 
         if (SelectionConditions(i)>0) then nselected:=nselected+1; 
	end; 
 
ReDim mandist(nselected),eucdist(nselected),nummandist(nselected),numeucdist(nselected), 
      nummahadist(nselected); 
ReDim mahadist(nselected); 
ReDim mahadata(nselected,count),dmat(count,count),invdmat(count,count); 
 
 
for i:=1 to nselected do  
	begin 
        cmisscnt(i):=0; 
        mandist(i):=0; 
	  eucdist(i):=0; 
	end; 
vnamestr$:=''; 
for i:=1 to count do 
	begin 
            vnamestr$:=vnamestr$+varname(varlist(i))+'|'; 
		for j:=1 to count do 
			begin 
                        vi:=varlist(i); 
			      vj:=varlist(j); 
				for k:=1 to NCases do 
					begin 
				         if (SelectionConditions(k)=1) then 
						begin                                     
					        inval1:=data(k,vi); 
					        inval2:=data(k,vj); 
                                      nmiss1:=valid(inval1);	                
					        nmiss2:=valid(inval2); 
	                                if not(nmiss1*nmiss2=1) then  
							missvar(i,j):=missvar(i,j)+1; 
						end; 
 					end;			       
									 
			end; 
	end; 
 
mvnamestr$:=''; 
for i:=1 to count do 
	begin 
	  varnum:=varlist(i); 
        selcnt:=1; 
	  for j:=1 to NCases do 
		begin                
		   sel:=SelectionConditions (j); 
               inval:=data(j,varnum); 
               nonmiss:=valid(inval);    
                       
               if (sel*nonmiss=1) then 
						begin 
                                        sums(varnum):=sums(varnum)+inval; 
                                        ns(varnum):=ns(varnum)+1; 
						    mahadata(selcnt,i):=inval; 
                                        selcnt:=selcnt+1; 
                                        mvnamestr$:=mvnamestr$+Str (j,3,0)+'|'; 
						end; 
               if ((sel=1) and  (nonmiss=0)) then 
						begin 
							vmisscnt(varnum):=vmisscnt(varnum)+1; 
                                          cmisscnt(j):=cmisscnt(j)+1;			 
						end; 
		end; 
	end; 
 
for i:=1 to count do 
	begin 
   	   varnum:=varlist(i); 
         if (ns(varnum)>0) then means(varnum):=sums(varnum)/ns(varnum)  
         else means(varnum):=missing; 
	end; 
 
selcnt:=0; 
for i:=1 to NCases do 
	begin 
            manpart:=0; 
            eucpart:=0; 
            mansum:=0; 
            eucsum:=0; 
            missflag:=0; 
		for j:=1 to count do 
			begin 
                     varnum:=varlist(j); 
                     inval:=data(i,varnum); 
                     manpart:=abs(inval-means(varnum)); 
                     eucpart:=manpart*manpart; 
                     
                     if valid(inval) then 
				begin 
                          mansum:=mansum+manpart; 
		              eucsum:=eucsum+eucpart; 
				end 
                      else  
				begin 
                          mansum:=missing; 
				  eucsum:=missing; 
				end; 
			end; 
		sel:=SelectionConditions (i); 
            if (sel>0) then 
			begin 
			  selcnt:=selcnt+1;                 
                    nummandist(selcnt):=i; 
	              numeucdist(selcnt):=i;    
                    nummahadist(selcnt):=i; 
	              mandist(selcnt):=mansum; 
                    eucdist(selcnt):=sqrt(eucsum); 
                     
			end; 
	end; 
 
  VectorDualSort(mandist,nummandist, SORT_DESCENDING);  
  VectorDualSort(eucdist,numeucdist, SORT_DESCENDING);  
  ReDim manres(NCases,2),eucres(NCases,2),mahares(NCases,5); 
  for i:=1 to nselected do 
	begin 
	  manres(i,1):=nummandist(i); 
        manres(i,2):=mandist(i); 
	  eucres(i,1):=numeucdist(i); 
        eucres(i,2):=eucdist(i); 
	end;		 
   
 
colstr$:='Case no.|distance'; 
rowstr$:=''; 
NewScrollsheet (nselected, 2, manres, 'Manhattan dist.', rowstr$, colstr$); 
 
colstr$:='Case no.|distance'; 
rowstr$:=''; 
NewScrollsheet (nselected, 2, eucres, 'Euclidean dist.', rowstr$, colstr$); 
NewScrollsheet (count, count, missvar, 'Missing values between variables', vnamestr$, 
vnamestr$); 
 
MatrixCrossProductofDev(mahadata, 1, dmat); { Bortz 1, p. 643 } 
MatrixInverse(dmat,invdmat); 
 
ReDim meandiff(count,1),meandifftransp(1,count); 
Redim resmat1(1,count),resmat2(1,1),resmat3(1,1); 
 
n:=nselected; 
p:=count; 
for i:=1 to nselected do 
	begin 
         for j:=1 to count do 
		begin 
              meandiff(j,1):=mahadata(i,j)-means(varlist(j)); 
              meandifftransp(1,j):=mahadata(i,j)-means(varlist(j)); 
		end; 
            MatrixMultiply(meandifftransp,invdmat,resmat1); 
            Matrixmultiply(resmat1,meandiff,resmat2); 
            MatrixElemMultiply(resmat2, n-1, resmat3); 
            res:=resmat3(1,1); 
            mahadist(i):=res;  
	end; 
for i:=1 to nselected do 
	begin 
	  mahares(i,1):=nummahadist(i); 
        mahares(i,2):=mahadist(i); 
        f:=mahadist(i)*(n-p)/(p*(n-1)); 
        mahares(i,3):=f; 
        mahares(i,4):=1.0-iFDistr(f,p,n-p);	 
        if (mahares(i,4)<0.05) then mahares(i,5):=1 else mahares(i,5):=0; 
	end;		 
 NewScrollsheet (nselected, 4, mahares, 'Multivariate outliers report (unsorted)', 
rowstr$, 'case no.|T sq.|F|p'); 
 
if res2>0 then 
	begin 
        col:=VarList2(1); 
        for i:=1 to nselected do 
		begin 
              data(i,col):=mahares(i,5);                 
		end; 
	end; 
 
 VectorDualSort(mahadist,nummahadist, SORT_DESCENDING);  
 for i:=1 to nselected do 
	begin 
	  mahares(i,1):=nummahadist(i); 
        mahares(i,2):=mahadist(i); 
        f:=mahadist(i)*(n-p)/(p*(n-1)); 
        mahares(i,3):=f; 
        mahares(i,4):=1.0-iFDistr(f,p,n-p)	 
	end;		 
 NewScrollsheet (nselected, 4, mahares, 'Multivariate outliers report (sorted on T 
sq.)', rowstr$, 'case no.|T sq.|F|p'); 
 
NewIconGraph (ICON_SUNRAYS, 'Multivariate plot', '', 'icon labels are case numbers', 
count, nselected, mahadata, mvnamestr$, vnamestr$);
Back to List of Programs



[StatSoft]
2300 East 14th Street, Tulsa, OK 74104
Phone: (918) 749-1119; Fax: (918) 749-2217

[StatSoft]e-mail: info@statsoft.com

©Copyright StatSoft, Inc., 1984-2004.
StatSoft, StatSoft logo, STATISTICA, SEWSS, SEDAS, Data Miner, SEPATH and GTrees are trademarks of StatSoft, Inc.