RandomAccess;
NoDataFileVariableNames;
ReDim varlist(NVars),VarList2(Nvars);
ReDim buffer(NCases);
res:=SelectVariables1 ('Check for multivariate outliers', 1, NVars, VarList, Count,
'Select variables');
if (res=0) then exit;
res2:=SelectVariables1 ('Select variable to store outlier indicator to', 1, 1, VarList2,
Count2, 'Select variable');
ReDim sums(NVars),ns(NVars),means(NVars),vmisscnt(NVars),cmisscnt(NCases);
ReDim missvar(count,count);
for i:=1 to NVars do
begin
sums(i):=0; ns(i):=0; means(i):=0; vmisscnt(i):=0;
end;
nselected:=0;
for i:=1 to NCases do
begin
if (SelectionConditions(i)>0) then nselected:=nselected+1;
end;
ReDim mandist(nselected),eucdist(nselected),nummandist(nselected),numeucdist(nselected),
nummahadist(nselected);
ReDim mahadist(nselected);
ReDim mahadata(nselected,count),dmat(count,count),invdmat(count,count);
for i:=1 to nselected do
begin
cmisscnt(i):=0;
mandist(i):=0;
eucdist(i):=0;
end;
vnamestr$:='';
for i:=1 to count do
begin
vnamestr$:=vnamestr$+varname(varlist(i))+'|';
for j:=1 to count do
begin
vi:=varlist(i);
vj:=varlist(j);
for k:=1 to NCases do
begin
if (SelectionConditions(k)=1) then
begin
inval1:=data(k,vi);
inval2:=data(k,vj);
nmiss1:=valid(inval1);
nmiss2:=valid(inval2);
if not(nmiss1*nmiss2=1) then
missvar(i,j):=missvar(i,j)+1;
end;
end;
end;
end;
mvnamestr$:='';
for i:=1 to count do
begin
varnum:=varlist(i);
selcnt:=1;
for j:=1 to NCases do
begin
sel:=SelectionConditions (j);
inval:=data(j,varnum);
nonmiss:=valid(inval);
if (sel*nonmiss=1) then
begin
sums(varnum):=sums(varnum)+inval;
ns(varnum):=ns(varnum)+1;
mahadata(selcnt,i):=inval;
selcnt:=selcnt+1;
mvnamestr$:=mvnamestr$+Str (j,3,0)+'|';
end;
if ((sel=1) and (nonmiss=0)) then
begin
vmisscnt(varnum):=vmisscnt(varnum)+1;
cmisscnt(j):=cmisscnt(j)+1;
end;
end;
end;
for i:=1 to count do
begin
varnum:=varlist(i);
if (ns(varnum)>0) then means(varnum):=sums(varnum)/ns(varnum)
else means(varnum):=missing;
end;
selcnt:=0;
for i:=1 to NCases do
begin
manpart:=0;
eucpart:=0;
mansum:=0;
eucsum:=0;
missflag:=0;
for j:=1 to count do
begin
varnum:=varlist(j);
inval:=data(i,varnum);
manpart:=abs(inval-means(varnum));
eucpart:=manpart*manpart;
if valid(inval) then
begin
mansum:=mansum+manpart;
eucsum:=eucsum+eucpart;
end
else
begin
mansum:=missing;
eucsum:=missing;
end;
end;
sel:=SelectionConditions (i);
if (sel>0) then
begin
selcnt:=selcnt+1;
nummandist(selcnt):=i;
numeucdist(selcnt):=i;
nummahadist(selcnt):=i;
mandist(selcnt):=mansum;
eucdist(selcnt):=sqrt(eucsum);
end;
end;
VectorDualSort(mandist,nummandist, SORT_DESCENDING);
VectorDualSort(eucdist,numeucdist, SORT_DESCENDING);
ReDim manres(NCases,2),eucres(NCases,2),mahares(NCases,5);
for i:=1 to nselected do
begin
manres(i,1):=nummandist(i);
manres(i,2):=mandist(i);
eucres(i,1):=numeucdist(i);
eucres(i,2):=eucdist(i);
end;
colstr$:='Case no.|distance';
rowstr$:='';
NewScrollsheet (nselected, 2, manres, 'Manhattan dist.', rowstr$, colstr$);
colstr$:='Case no.|distance';
rowstr$:='';
NewScrollsheet (nselected, 2, eucres, 'Euclidean dist.', rowstr$, colstr$);
NewScrollsheet (count, count, missvar, 'Missing values between variables', vnamestr$,
vnamestr$);
MatrixCrossProductofDev(mahadata, 1, dmat); { Bortz 1, p. 643 }
MatrixInverse(dmat,invdmat);
ReDim meandiff(count,1),meandifftransp(1,count);
Redim resmat1(1,count),resmat2(1,1),resmat3(1,1);
n:=nselected;
p:=count;
for i:=1 to nselected do
begin
for j:=1 to count do
begin
meandiff(j,1):=mahadata(i,j)-means(varlist(j));
meandifftransp(1,j):=mahadata(i,j)-means(varlist(j));
end;
MatrixMultiply(meandifftransp,invdmat,resmat1);
Matrixmultiply(resmat1,meandiff,resmat2);
MatrixElemMultiply(resmat2, n-1, resmat3);
res:=resmat3(1,1);
mahadist(i):=res;
end;
for i:=1 to nselected do
begin
mahares(i,1):=nummahadist(i);
mahares(i,2):=mahadist(i);
f:=mahadist(i)*(n-p)/(p*(n-1));
mahares(i,3):=f;
mahares(i,4):=1.0-iFDistr(f,p,n-p);
if (mahares(i,4)<0.05) then mahares(i,5):=1 else mahares(i,5):=0;
end;
NewScrollsheet (nselected, 4, mahares, 'Multivariate outliers report (unsorted)',
rowstr$, 'case no.|T sq.|F|p');
if res2>0 then
begin
col:=VarList2(1);
for i:=1 to nselected do
begin
data(i,col):=mahares(i,5);
end;
end;
VectorDualSort(mahadist,nummahadist, SORT_DESCENDING);
for i:=1 to nselected do
begin
mahares(i,1):=nummahadist(i);
mahares(i,2):=mahadist(i);
f:=mahadist(i)*(n-p)/(p*(n-1));
mahares(i,3):=f;
mahares(i,4):=1.0-iFDistr(f,p,n-p)
end;
NewScrollsheet (nselected, 4, mahares, 'Multivariate outliers report (sorted on T
sq.)', rowstr$, 'case no.|T sq.|F|p');
NewIconGraph (ICON_SUNRAYS, 'Multivariate plot', '', 'icon labels are case numbers',
count, nselected, mahadata, mvnamestr$, vnamestr$);
| Back to List of Programs |
![[StatSoft]](../../../images/sssmall.gif)
2300 East 14th Street, Tulsa, OK 74104
Phone: (918) 749-1119; Fax: (918) 749-2217
e-mail: info@statsoft.com
©Copyright StatSoft, Inc., 1984-2004.
StatSoft, StatSoft logo, STATISTICA, SEWSS, SEDAS, Data Miner, SEPATH and GTrees are trademarks of StatSoft, Inc.