Some times we have two datasets with the same structure. For example, one has collected data in different periods and saved in separate in different data sets. To do analysis, one will first concatenate these data. This looks simple, however, some times you get error message because a variable say study_id in one dataset is numeric and in another dataset is character. This is actually good because you got an error message, so that you can change the variable to same type. However, if a date variable in one data is datetime format and another is in date format, you may get no error message when you concatenate your data. There fore, we should always check the agreement of two dataset before we concatenate them together. One simple solution is to use proc contents procedure to print the contents of two data set and compare the variables one by one. Can we let SAS automatically output a table to help us? we are now to create a macro for this purpose.
We want our macro to create a table with the first 4 variables as below:
1. varname : includes all variables in all data sets
2. in_all_data : a variable has two values, 1 means this variable in all datasets, 0 means not in all datasets.
3. match_main : the variable in all data set have same type and same length.
4. match_all : the variable in all data set have same type, length and format
and then follow the columns type1, length1, format1, type2, length2, format2 etc... for data1, data2 ...
In this macro, we require the user input the data list, and the name of the output table.
The data list is in this format: work.classJan work.classFeb work.classMar
That is, we need to include the lib name in the data and we use empty space to separate data.
Here are the sections for the macro:
1. after read in the data list, we create datasets to store variable's name, type, length and format for each data, and called them _columns1, _columns2 ... corresponding to the first, second, ... data in the list.
To merge these new created data _columns1, _columns2 into a big data, but they have same variable names : type, length and format,
a: change variable name to type1, length1, format1 , type2 length2 format2 ... correspond the to
_columns1, _columns2 ...
b: merge data by name, and name this big data _columns_all
2. if a variable exists in all data assign in_all=1 else assign in_all=0
To do that we check if there is missing values in type1, type2, ... because an exist variable must have a type.
3. if a variable exists in all data set, check if all dataset have the same type and length, assign match_main=1 if the answer is yes.
4. if match_main=1 then we check if their format are the same, if the answer is yes assign match_all=1
5. output the final data.
That looks easy, and the following macro only have 90 rows.
we include 3 macros inside this macro:
1. macro macvarlen is used to calculate the length of a name list separated by a delimiter.
2. the pair macro is a modification from the pair macro in last post, we use this macro to create the condition used in the if clause, for example type1=type2 and type2=type3 and type3=type4
3. macro expandname used to create name list, by given a key , start number, stop number.
for example, %expandanme(type, 1, 3, outputlist), the output list will be: "type1 type2 type3" without the quotation mark.
Here is the macro:
/*-------------------------------------------------------------------*/
/* D_CHK_DATAMTCH v1.0 */
/* Check if variables are in all data sets */
/* if variables with same name having same type, length and format */
/* by Elite Bistatistics */
/*-------------------------------------------------------------------*/
/* */
/* This macro is provided "as is" by Elite Biostatistics. There */
/* are no warranties, expressed or implied, as to merchantability or */
/* fitness for a particular purpose regarding the materials or code */
/* contained herein. The company is not responsible for errors in */
/* this material as it now exists or will exist, nor does the company*/
/* provide technical support for it. */
/* */
/*-------------------------------------------------------------------*/
/* Questions or problem reports concerning this material may be */
/* addressed to the company by email: */
/* EB@elitebiostats.com use this subject field: */
/* Comments for SAS code */
/* */
/*-------------------------------------------------------------------*/
/* Date Last Updated: 01Sep2012 */
/*-------------------------------------------------------------------*/
/* Purpose */
/* This macro is to compare multiple data sets whether they have */
/* common variables and of the common variables whether they have */
/* same type, length, and format. */
/* input : */
/* datalist: the list of datasets with lib names separated by space */
/* e.g work.classJan work.classFeb work.classMar */
/* output data: user specified */
/* */
/*-------------------------------------------------------------------*/
%macro D_CHK_DATAMTCH(datalist, outdata);
%*------------------------------------------------------------*;
%* macros used in D_CHK_DATMTCH *;
%macro macvarlen(list, delim, num);
%* calculate the length of the list by the delimeter delim *;
%let i=1;
%do %while(%scan(&list, &i, &delim)^=);
%let i=%eval(&i+1);
%end;
%let &num=%eval(&i-1);
%mend;
%macro pair(lista, listb, operator, connectby, outlist);
%*------------------------------------------------------------*;
%* lista, listb are two lists with same number of items *;
%* operator is used to pair items in lista and listb *;
%* connectby is the string to connect the paires *;
%* For Example *;
%* pair(type1 type2, case1 case2, =, and, mylist) *;
%* mylist is created as *;
%* type1=case1 and type2=case2 *;
%*------------------------------------------------------------*;
%let i=1;
%do %while(%scan(&lista, &i)^=);
%let i=%eval(&i+1);
%end;
%let n=%eval(&i-1);
%let newlist=;
%do i=1 %to &n;
%let item1=%scan(&lista,&i);
%let item2=%scan(&listb,&i);
%if &i=1 %then %let newlist=&item1.&operator.&item2;
%if &i>1 and &i<%eval(&n-1) %then %let newlist=&newlist &item1.&operator.&item2 &connectby;
%if &i>1 and &i=&n %then %let newlist=&newlist &item1.&operator.&item2;
%end;
%let &outlist=&newlist;
%mend;
%macro expandname(key, num1, num2, outputlist);
%*------------------------------------------------------------*;
%* output a list with the key string follow by numbers *;
%* expandname(type, 1, 3, mylist) *;
%* mylist is created as *;
%* type1 type2 type3 *;
%*------------------------------------------------------------*;
%let &outputlist=;
%let nlist=;
%do i=&num1 %to &num2;
%let nlist=&nlist &key&i;
%end;
%let &outputlist=&nlist;
%mend;
%* section one, create a big table named _columns_all to *;
%* include all variable names and their types, length, format *;
%let datalist=%upcase(&datalist);
%let columndatalist=;
%let data_n=;
%macvarlen(&datalist,' ',data_n);
%put data_n is &data_n;
%do i=1 %to &data_n;
%let data=%scan(&datalist,&i, ' ');
%let libname=%scan(&data,1,'.');
%let memname=%scan(&data,2,'.');
%put &libname &memname;
data _columns&i;
set sashelp.vcolumn;
keep name type length format;
rename type=type&i length=length&i format=format&i;
where libname="&libname" and memname="&memname";
run;
%end;
%let columndatalist=;
%expandname(_columns, 1, &data_n, columndatalist);
%put columndatalist= &columndatalist;
data _columns_all;
merge &columndatalist;
by name;
run;
%* section two, create the condition for if statements *;
%* using macro expandname, and pair *;
%* assign in_all, match_main and match_all values *;
%* in_all : 1 indicate the variable in all data *;
%* match_main : 1 indicates all data has the same type and *;
%* length for the variable *;
%* match_all : 1 indicates all data has the same type, length *;
%* and format for the variable *;
%let type1=;
%let type2=;
%let length1=;
%let length2=;
%let format1=;
%let format2=;
%expandname(type, 1, %eval(&data_n-1), type1);
%expandname(type, 2, &data_n, type2);
%expandname(length, 1, %eval(&data_n-1), length1);
%expandname(length, 2, &data_n, length2);
%expandname(format, 1, %eval(&data_n-1), format1);
%expandname(format, 2, &data_n, format2);
%put type1=&type1;
%put type2=&type2;
%let cmptype=;
%let cmplength=;
%let cmpformat=;
%pair(&type1, &type2, =, and, cmptype);
%pair(&length1, &length2, =, and, cmplength);
%pair(&format1, &format2, =, and, cmpformat);
data &outdata;
length in_all match_main match_all 8;
set _columns_all;
array type{&data_n} type1 - type&data_n;
in_all=1;
do i=1 to &data_n;
if type(i)='' then in_all=0;
end;
if in_all =1 and &cmptype and &cmplength then match_main=1; else match_main=0;
if match_main=1 and &cmpformat then match_all=1; else match_all=0;
drop i;
run;
%mend;
To test the macro, we first create two empty dataset and pass them to the macro.
data test1;
length v1 v2 v3 8 v4 8 v5 8 v7 $10 v8 $12 v9 8;
format v1 v2 v3 best32. v4 mmddyy10. v5 datetime13.;
stop;
run;
data test2;
length v1 $12 v2 v3 8 v4 4 v5 8 v7 $10 v8 $12;
format v1 $12. v2 v3 best32. v4 mmddyy10. v5 mmddyy10.;
stop;
run;
%D_CHK_DATAMTCH(work.test1 work.test2, testout);