Wednesday, February 13, 2008

Netflix Challenge - Reading all the data into SAS

The following codes reads all the data from 17000+ files into a SAS dataset. The programming involves some important data step properties - this is not to encourage to compete on the challenge. The solution to the challenge problem involves for application to be in C or C++. I was curious about the data, its data structure, and overall about the problem.

===========================================================================


libname Net "C:\Documents and Settings\All Users\Documents\NetFlixChallenge\";

libname Netdown "C:\Documents and Settings\All Users\Documents\NetFlixChallenge\download\";
libname Nettrain "C:\Documents and Settings\All Users\Documents\NetFlixChallenge\training_set\";

data netdown.movies1(compress=yes);
infile "C:\Documents and Settings\All Users\Documents\NetFlixChallenge\download\movie_titles.txt" dsd dlm=","
missover truncover /*obs=100 */ ;
input m_num $ rel_yr $ title $80.
;
run;

%let lengthi=6;

data netdown.movies(drop=prefix1 ); set netdown.movies1;
prefix1=repeat('0',&lengthi-length(m_num));
m_expand='mv_'||trim(prefix1)||trim(m_num);
run;

proc print data=netdown.movies(obs=10); run;

proc sql noprint;
select m_expand into: MID1 -: MID17770
from netdown.movies;
quit;

data netdown.superdat;
length mid $7. cid 8. rank 8. rank_dt $10.;
run;

%macro netdata(tfiles=j);
%do i=1 %to &tfiles;
data netdown.mm (keep=mid);
length mid $7.;
mid1="&&mid&i";
mid=substr(mid1,4,7);
run;

data netdown.add1(keep=cid rank rank_dt);
infile "c:\Documents and Settings\All Users\Documents\NetFlixChallenge\training_set\&&MID&i...txt" dsd dlm=","
missover truncover firstobs=2 ;

input cid rank rank_dt $10.;
keep cid rank rank_dt;
run;

data netdown.add ; if _n_=1 then set netdown.mm; set netdown.add1; run;
proc append base=netdown.superdat data=netdown.add ; run;
%end;
%mend;
%netdata(tfiles=17770);

proc freq data=netdown.superdat;
table mid; run;

data netdown.superdat1(compress=yes); set netdown.superdat;
if mid ne .;
run;

/* work with first 10 movies as a sample */
data netdown.sample1_10; set netdown.superdat1 (obs=10);
run;

proc sql noprint;
select cid into: c1 -: c10
from netdown.sample1_10;
quit;

PROC SQL NOPRINT;
CREATE TABLE netdown.macro_vars AS
SELECT name, value
FROM sashelp.vmacro
WHERE scope='GLOBAL'
ORDER BY 1;
QUIT;

*********** SAMPLE WORK ON FIRST MOVIE ***********************;
%let s1_numc=10; /*number of customers for sample1_10 */

proc sort data=netdown.superdat1 out=netdown.superC; by cid; run;
proc sort data=netdown.sample1_10; by cid; run;

data netdown.sample1_10_merged; merge netdown.superc (in=c) netdown.sample1_10 (in=s); by cid;
if s; run;

proc sql noprint;
create table netdown.superc_sql as
select a.*
from netdown.superc as a left join
netdown.sample1_10 as b
on a.cid=b.cid
where a.rank_dt ne "";
quit;



data netdown.s1_scores;
length mid $7. cid 8. rank 8. rank_dt $10.;
run;


%macro collect_scores(movie_num=mn,num_c=s1_numc);
title "Movie evaluated is &movie_num";
%do i=1 %to &num_c;
data netdown.s1_scores_add;
set netdown.superc;
where cid=&&c&i; /* since i know the customers ids are stored in the macro seq. c(i) = cn */
run;
proc append base=netdown.s1_scores data=netdown.s1_score_add; run;
%end;
%mend;

%collect_scores(movie_num='0000001',num_c=10);
quit;

No comments: