MadLINQ: Large-Scale Distributed Matrix Computation for the Cloud By Zhengping Qian, Xiuwei Chen, Nanxi Kang, Mingcheng Chen, Yuan Yu, Thomas Moscibroda, Zheng Zhang Microsoft Research Asia, Shanghai Jiaotong University, Microsoft Research Silicon Valley Presenter: Haikal Pribadi (hp356)
MadLINQ Motivation Contribution Evaluation Future Work
Motivation
Distributed Engines – Good and Bad Success – Strong subset of relational operators ● Filtering, projection, aggregation, sorting and joins ● Extensions via user-defined functions – Adopts directed-acyclic-graph (DAG) execution model ● Scalable and resilient Problematic – Deep analysis and manipulation of data – Requires linear algebra and matrix computation
Distributed Engines - Problem Linear algebra and matrix computation – Machine Learning ● Multiplication, SVD, LU factorization ● Cholesky factorization – Ranking or classification algorithm – Social web-mining or information retrieval – Hard to capture in relational algebra operators – Real world matrix and data mining algorithms are extremely hard to implement
High Performance Computing Solution to matrix computation However – Involves low level primitives to develop algorithms – Single Process Multiple Data (SPMD) execution model – Problem maintained in memory – Constrains programmability, scalability and robustness – Not applicable for web-scale big data analysis
HAMA – Matrix Operation on MapReduce Removes the constraint of problem size MapReduce interface is restrictive – Difficult to program real world linear algebra – Implicitly synchronized – Fails to take advantage of semantics of matrix operations
Contribution
Matrix Computation System Unified programming model – Matrix development language – Application development library Integrate with data-parallel computing system Maintain scalability and robustness of DAG – Fine-grained pipelining (FGP) – Lightweight fault-tolerance protocol
Programming Model - Matrix Develop matrix algorithms Matrix optimizations Based on tile abstraction – Square sub-matrices – Indexed grid of tiles form a matrix – Matrices expressed naturally – Structural characteristic of matrices
Programming Model - Matrix Matrix multiplication code example: MadLINQ.For(0, m, 1, i => { MadLINQ.For(0, p, 1, j => { c[i, j] = 0; MadLINQ.For(0, n, 1, k => c[i, j] += a[i, k] * b[k, j]); }); });
Programming Model - Matrix Cholesky tile-algorithm implementation MadLINQ.For(0, n, 1, k => { L[k, k] = A[k, k].DPOTRF(); MadLINQ.For(k + 1, n, 1, l => L[l, k] = Tile.DTRSM(L[k, k], A[l, k])); MadLINQ.For(k + 1, n, 1, m => { A[m, m] = Tile.DSYRK(A[m, k], A[m, m]); MadLINQ.For(m + 1, n, 1, l => A[l, m] = Tile.DGEMM(A[l, k], A[m, k], A[l, m])); }); });
Programming Model – Application ex. Collaborative Filtering – Baseline algorithm with dataset from Netflix – Dataset: matrix R records users' ratings on movies ● similarity = R x R^T (sparse matrix) ● scores = similarity x R (dense matrix) Matrix similarity = R.Multiply(R.Transpose()); Matrix scores = similarity.Multiply(R).Normalize();
Programming Model – Application ex. Markov Clustering – Adjacency matrix to represent graphs MadLINQ.For(0, DEPTH, 1, i => { // Expansion G = G.Multiply(G); // Inflate: element-wise x^2 and row-based normalization G = G.EWiseMult(G).Normalize().Prune(); });
Programming Model – Application ex. Regularized Latent Semantic Index (RLSI) – web-mining algorithm to derive approximate topic model for Web docs – Only 10 LoC while SCOPE's adoption of MapReduce takes 1100+ LoC MadLINQ.For(0, T, 1, i => { // Update U Matrix S = V.Multiply(V.Transpose()); Matrix R = D.Multiply(V.Transpose()); // Assume tile size >= K MadLINQ.For(0, U.M, 1, m => U[m, 0] = Tile.UpdateU(S[0,0], R[m,0])); // Update V Matrix Phi = U.Transpose().Multiply(D); V = U.Transpose() .Multiply(U) .Add(TiledMatrix<double>.EYE(U.N, lambda2)) .CholeskySolve(Phi); });
Integration with DryadLINQ // The input datasets var ratings = PartitionedTable.Get<LineRecord>(NetflixRating); // Step 1: Process the Netflix dataset in DryadLINQ Matrix R = ratings.Select(x => CreateEntry(x)).GroupBy(x => x.col) .SelectMany((g, i) => g.Select(x => new Entry(x.row, i, x.val))) .ToMadLINQ(MovieCnt, UserCnt, tileSize); // Step 2: Compute the scores of movies for each user Matrix similarity = R.Multiply(R.Transpose()); Matrix scores = similarity.Multiply(R).Normalize(); // Step 3: Create the result report var result = scores.ToDryadLinq(); result.GroupBy(x => x.col).Select(g => g.OrderBy().Take(5));
Recommend
More recommend