Casewise deletion and mean substitution options are available when the data contains missing values. Maximum subset size and the number of best subsets to display for each subset size can be user-specified. For large problems the number of Scrollsheets produced can exceed the number of discardable windows which may be displayed on the screen at one time (the number of discardable windows can be specified in Window Menu... Scrollsheet Manager). If desired the subsets of predictor variables producing the highest R-Squares can be automatically printed in the Text Output Window as well as in Scrollsheets (note: the automatic printing option in the Output Setup window will not show full details).
The results are in the form of two graphs. A Scrollsheet of this output can also be requested. The first graph shows the maximum R-Square value for each subset size plotted against the subset size and the second graph shows the increase in the maximum R-Square for each subset size compared to the maximum R-Square for the previous subset size, again plotted against the subset size. This output is intended to be an aid in determining which model to look at in more depth. The data file (fat_data.sta) used for this program is available for downloading (download data file now). The data were used in an example on Detecting Multicollinearity in 'SAS System for Regression', 1986 Edition.
Program written, modified, or edited at StatSoft, Inc.}
RandomAccess;
NoDataFileVariableNames;
{make sure that the data can be analyzed}
if NCases > 32767 then begin
DisplayMessageBox (MB_OK, 'Data Too Large',
'This BASIC program can analyse upto 32767 cases. The data has too many cases.');
stop;
end;
{get dependent and predictor variable lists}
redim inde(NVars-1);
if SelectVariables2 ('Variable Selection',1,1,depe,C1,'Dependent variable',
2,NVars-1,inde,NSelected,'Predictor variables') = 0 then stop;
{check for overlapping lists}
for i := 1 to NSelected do
if depe = inde(i) then begin;
DisplayMessageBox(MB_ICONSTOP,'Overlapping Variable Lists',
'The selected dependent variable also appears in the predictor variable list. '+
'Cannot perform multiple regression for these data.');
stop;
end;
{get output option}
window := DisplayButtonBox('Would you like to use the Output Window?',
'Yes, Print Scrollsheets to the Text/Output Window|No, Screen Display Only');
if window = 0 then stop;
{get maximum subset size}
repeatchoice:
if NSelected < 10 then choice := NSelected else choice := 10;
if DisplayNumericInputBox ('Maximum Number of Predictors in Subsets',
'Maximum number of predictors:',choice) = 0 then stop;
if (choice < 1) or (choice > NSelected) then begin
DisplayMessageBox(MB_ICONSTOP,'Invalid Maximum Number of Predictors',
'The maximum number of predictors in subsets must be between 1 and the number '+
'of predictor variables (inclusive).');
GOTO repeatchoice;
end;
{get number of best subsets to show}
option := DisplayButtonBox ('Number of Best Subsets to Display',
'Prompt for user-specified maximum|Show all subsets');
if option = 0 then stop;
if option = 2 then size := 0;
if option = 1 then begin
repeatsize:
size := 10;
if DisplayNumericInputBox ('User-Specified Maximum Number to Display',
'n best subsets to display:', size) = 0 then stop;
if size < 1 then begin
DisplayMessageBox(MB_ICONSTOP,'Invalid Maximum Number to Display',
'The maximum number to display must be greater than 0');
GOTO repeatsize;
end;
end;
{copy raw data to modeldata matrix}
redim modeldata(NCases,NSelected+1);
for i := 1 to NSelected do MatrixCopy (data,1,inde(i),NCases,1,modeldata,1,i);
MatrixCopy (data,1,depe,NCases,1,modeldata,1,NSelected+1);
{check to see if there are any missing values}
missingvals := 0;
redim count(NSelected+1);
redim check(NCases);
for i := 1 to NSelected+1 do begin
MatrixExtract (modeldata,1,i,NCases,1,check);
count(i) := ValCount (check,1,NCases);
if count(i) < NCases then begin
missingvals := 1;
count(i) := 1;
end
else count(i) := 0;
end;
NofC := NCases;
{begin solution to missing values problem if necessary}
if missingvals > 0 then begin
imiss := DisplayButtonBox ('Choose how to deal with missing data',
'Delete Cases With Missing Data|Substitute Missing Values by the Variable Mean');
if imiss = 0 then stop;
{replacement by mean}
if imiss = 2 then begin
for i := 1 to NSelected + 1 do begin
if count(i) = 1 then begin
sum := 0;
nn := 0;
for j := 1 to NCases do begin
if valid(modeldata(j,i)) = 1 then begin
nn := nn + 1;
sum := sum + modeldata(j,i);
end;
xbar := sum/nn;
end;
for j := 1 to NCases do begin
if valid(modeldata(j,i)) = 0 then modeldata(j,i) := xbar;
end;
end;
end;
end;
{casewise deletion}
if imiss = 1 then begin
redim marker(NCases);
for i := 1 to NSelected + 1 do
if count(i) = 1 then
for j:= 1 to NCases do
if valid(modeldata(j,i)) = 0 then marker(j) := 1;
ValSum(marker,1,NCases,NMiss);
NofC := NCases - Nmiss;
redim linedata(1,NSelected+1);
redim tempdata(NCases,NSelected+1);
MatrixCopy(modeldata,1,1,0,0,tempdata,1,1);
redim modeldata(NofC,NSelected+1);
line := 1;
for i := 1 to NCases do
if marker(i) = 0 then begin
MatrixExtract(tempdata,i,1,1,0,linedata);
MatrixSetRow(modeldata,line,linedata);
line := line + 1;
end;
end;
end;
{end of missing values section}
{check to make sure that the correlation matrix for the predictor variables is nonsingular}
redim covars(NofC,NSelected);
redim covars2(NSelected,NSelected);
MatrixCopy (modeldata,1,1,NofC,NSelected,covars,1,1);
MatrixCorrelations (covars,1,covars2);
MatrixDet (covars2,singularitytrap);
if singularitytrap<.000001 then begin
DisplayMessageBox(MB_ICONSTOP, 'Linearly Dependent Predictor Variables',
'The matrix of correlations for the selected predictor variables is singular. '+
'Cannot perform multiple regression for these data.');
stop;
end;
{compute correlation matrix}
redim covars(NSelected+1,NSelected+1);
redim covars2(NSelected+1,NSelected+1);
MatrixCorrelations (modeldata,1,covars2);
redim graphdata(choice);
redim graphaxis(choice);
redim ind2(NVars-1);
total := NSelected;
all := 0;
{==================================================}
{start of 1 to choice of maximum number of predictors in the model loop}
for jj := 1 to choice do begin
if jj > 1 then total := total * (NSelected + 1 - jj) / (jj);
redim output(total,jj+1);
t := 1;
for i := 1 to jj do ind2(i) := i;
{start of 1 to number of combinations of predictors loop}
for jjj := 1 to total do begin
{put a fresh copy of the correlation matrix into covars matrix}
MatrixCopy (covars2,1,1,NSelected+1,NSelected+1,covars,1,1);
{sweep on independent variables}
for i := 1 to jj do MatrixSweep (covars,ind2(i),ind2(i),1);
ssresid := covars(NSelected+1,NSelected+1);
rsq := 1 - ssresid;
{put results in output matrix with current variable numbers}
output(t,1) := rsq;
for i := 1 to jj do output(t,i+1) := inde(ind2(i));
t := t+1;
{create next combination}
if ind2(1) < NSelected - jj + 1 then begin
trap := 0;
for i := jj downto 1 do begin
if trap = 0 then begin
ind2(i) := ind2(i) + 1;
if ind2(i) < NSelected - (jj - i) + 1 then begin
iii := 1;
if i < jj then for ii := i + 1 to jj do begin
ind2(ii) := ind2(i) + iii;
iii := iii+1;
end;
trap := 1;
end;
end;
end;
end;
end;
{end of 1 to number of combinations of predictors loop}
{create Scrollsheet output}
redim rankedrs(total);
redim index(total);
if size >= total then NtoShow := total else NtoShow := size;
if size = 0 then NtoShow := total;
MatrixExtract (output, 1, 1, total, 1, rankedrs);
for i := 1 to total do index(i) := i;
VectorDualSort (rankedrs, index, SORT_DESCENDING);
str$ := 'Best Subsets of Predictors';
if jj < 10 then str2$ := Str(jj,1,0);
if (jj > 9) and (jj < 100) then str2$ := Str(jj,2,0);
if (jj > 99) then str2$ := Str(jj,3,0);
str3$ := SInsert (str$,str2$,17);
if jj = 1 then str3$ := SDelete(str3$,28,1);
redim display(NtoShow,jj+1);
redim rankedrs(NtoShow);
MatrixSetColumn (display,1,rankedrs);
ColNames$ := 'R-Square|';
for i := 2 to jj + 1 do begin
str4$ := 'Variable ';
if i < 11 then str5$ := Str(i-1,1,0) else str5$ := Str(i-1,2,0);
ColNames$ := ColNames$ + str4$ + str5$ + '|';
end;
for i := 1 to NtoShow
do for iii := 2 to jj+1 do display(i,iii) := output(index(i),iii);
show := NewScrollsheet (NtoShow,jj+1,display,str3$,?RowNames$,ColNames$);
for i := 1 to NtoShow
do for iii := 2 to jj+1 do ScrollsheetSetTextValue (show,i,iii,VarName(output(index(i),iii)));
graphaxis(jj) := jj;
graphdata(jj) := display(1,1);
if window = 1 then PrintScrollsheetToOutputWindow (show);
if (jj < choice) and (all = 0) then begin
showchoice := DisplayButtonBox ('Show Next Scrollsheet ?',
'Yes, Show Next|Show All (no further prompts)');
if showchoice = 0 then stop;
if showchoice = 2 then all := 1;
end;
end;
{end of 1 to number of maximum number of predictors in the model loop}
{==================================================}
{create the graphs and scrollsheet}
redim increase(choice-1);
redim scrolldata(choice,3);
{get increases}
For i := 2 to choice do increase(i-1) := graphdata(i) - graphdata(i-1);
{create Scrollsheet data}
MatrixSetColumn (scrolldata, 1, graphaxis);
MatrixSetColumn (scrolldata, 2, graphdata);
scrolldata(1,3) := missing;
for i:= 2 to Choice do scrolldata(i,3) := increase(i-1);
{make first graph}
firstgraph := NewGraph (LINEPLOT,'Plot of Maximum R-Square|for Each Subset Size',
'R-Square','Subset Size',Choice,graphaxis,graphdata);
GraphSetScaling (firstgraph,AX_X,Scaling_Manual_0,0.5,choice+0.5,1);
GraphSetPlotPointStyle (firstgraph, 1, ON, P_Diamond, 7, BLUE);
GraphSetDefaultFont (firstgraph, ?FontName$, 14, ?Color);
{alter the x-axis and get the y-axis size for the second graph}
MatrixElemAdd (graphaxis, 1, graphaxis);
ValMax (increase, 1, choice-1, biggest);
{make second graph}
secondgraph := NewGraph (LINEPLOT,'Plot of Increase in R-Square|from Previous Subset Size',
'R-Square Increase','Subset Size',Choice-1,graphaxis,increase);
GraphSetScaling (secondgraph,AX_X,Scaling_Manual_0,0.5,choice+0.5,1);
GraphSetScaling (secondgraph,AX_Y,Scaling_Manual,0,(biggest*1.1),(biggest/10));
GraphSetPlotPointStyle (secondgraph, 1, ON, P_Diamond, 7, BLUE);
GraphSetDefaultFont (secondgraph, ?FontName$, 14, ?Color);
{combine the graphs}
finalgraph := NewGraph (IGNOREDPLOT, 'Display of Output', ?Left$, ?Bottom$, 0, XValues, YValues);
GraphEmbedGraph (finalgraph, firstgraph, FALSE, FALSE, ?Mapping, 0, 100, 50, 0, FALSE);
GraphEmbedGraph (finalgraph, secondgraph, FALSE, FALSE, ?Mapping, 50, 100, 100, 0, FALSE);
DisplayGraph (finalgraph);
{now make the scrollsheet (if requested)}
outchoice := DisplayMessageBox (MB_YESNOCANCEL+MB_ICONQUESTION, 'More Output',
'Would you like to see the Scrollsheet of this output also?');
if outchoice <> IDYES then stop;
scrollout := NewScrollsheet (Choice,3,scrolldata,'Display of Graph
Data',?RowNames$,?ColumnNames$);
ScrollsheetSetColumnName (scrollout, 1, 'Subset', 'Size');
ScrollsheetSetColumnName (scrollout, 2, 'R-Square', 'Maximum');
ScrollsheetSetColumnName (scrollout, 3, 'R-Square', 'Increase');
ScrollsheetSetColumnFormat (scrollout, 1, SCF_INTEGER, 8);
ScrollsheetSetColumnFormat (scrollout, 3, SCF_DECIMAL, 8);
if window = 1 then PrintScrollsheetToOutputWindow (scrollout);
| Back to List of Programs |
![[StatSoft]](../../../images/sssmall.gif)
2300 East 14th Street, Tulsa, OK 74104
Phone: (918) 749-1119; Fax: (918) 749-2217
e-mail: info@statsoft.com
©Copyright StatSoft, Inc., 1984-2004.
StatSoft, StatSoft logo, STATISTICA, SEWSS, SEDAS, Data Miner, SEPATH and GTrees are trademarks of StatSoft, Inc.