Bogdan Vasilescu (CMU, ISR) Prem Devanbu (UCDavis) Casey Casalnuovo (UCDavis)
Recovering Clear, Natural Identifiers from Obfuscated (JavaScript) Names
@b_vasilescu @devanbu
Recovering Clear, Natural Identifiers from Obfuscated (JavaScript) Names - PowerPoint PPT Presentation
Bogdan Vasilescu Casey Casalnuovo Prem Devanbu (CMU, ISR) (UCDavis) (UCDavis) @b_vasilescu @devanbu Recovering Clear, Natural Identifiers from Obfuscated (JavaScript) Names @b_vasilescu Today var geom2d = function() { var t =
Bogdan Vasilescu (CMU, ISR) Prem Devanbu (UCDavis) Casey Casalnuovo (UCDavis)
@b_vasilescu @devanbu
@b_vasilescu
var geom2d = function() { var t = numeric.sum; function r(n, r) { this.x = n; this.y = r; } u(r, { P: function e(n) { return t([ this.x * n.x, this.y * n.y ]); } }); function u(n, r) { for (var t in r) n[t] = r[t]; return n; } return { V: r }; }();
@b_vasilescu
var geom2d = function() { var t = numeric.sum; function r(n, r) { this.x = n; this.y = r; } u(r, { P: function e(n) { return t([ this.x * n.x, this.y * n.y ]); } }); function u(n, r) { for (var t in r) n[t] = r[t]; return n; } return { V: r }; }();
@b_vasilescu
var geom2d = function() { var t = numeric.sum; function r(n, r) { this.x = n; this.y = r; } u(r, { P: function e(n) { return t([ this.x * n.x, this.y * n.y ]); } }); function u(n, r) { for (var t in r) n[t] = r[t]; return n; } return { V: r }; }(); var geom2d = function() { var sum = numeric.sum; function Vector2d(x, y) { this.x = x; this.y = y; } mix(Vector2d, { P: function dotProduct(vector) { return sum([ this.x * vector.x, this.y * vector.y ]); } }); function mix(dest, src) { for (var k in src) dest[k] = src[k]; return dest; } return { V: Vector2d }; }();
“Instead of imagining that our main task is to instruct a
computer what to do, let us concentrate rather on explaining to human beings what we want a computer to do.” [Don Knuth]
@b_vasilescu
code readability, reusability, maintainability
@b_vasilescu
code readability, reusability, maintainability
@b_vasilescu
code readability, reusability, maintainability [many]
@b_vasilescu
Hmmmm….
Tiger, Tiger burning bright In the forests
What immortal hand or eye, Could frame thy fearful symmetry?
Variable Name Guesser (AUTONYM)
Variable Name Guesser (AUTONYM)
Minified Source Code
function u(n, r) { for (var t in r) n[t] = r[t]; return n; }
Variable Name Guesser (AUTONYM)
Minified Source Code Un-Minified Source Code
function u(n, r) { for (var t in r) n[t] = r[t]; return n; } function mix(dest, src) { for (var k in src) dest[k] = src[k]; return dest; }
Moses SMT Pre-processing Post-processing
Autonym
Minified Source Code Un-Minified Source Code
Moses SMT Pre-processing Post-processing
Autonym
What’s the relevance of Machine Translation?
distorted message
channel model distorted message
channel model language model distorted message
channel model language model distorted message
channel model language model distorted message
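channel model language model distorted message (for a given $f$) Bayes' theorem: $\hat{e} = \arg\max_{e} P(e \mid f) = \arg\max_{e} P(f \mid e)\, P(e)$, where $P(f \mid e)$ is the channel model, $P(e)$ is the language model, and $f$ is the distorted message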
channel model language model distorted message Language model Translation (channel distortion) model
Language model
Translation model
Language model
Translation model
Aligned French-English Corpus
English Corpus
Language model
Translation model
Aligned French-English Corpus
English Corpus
Language model
Translation model
Aligned French-English Corpus
Clear Code Corpus
Language model
Translation model
Aligned Clear-Minified Code Corpus
Clear Code Corpus
Language model
Translation model
Aligned Clear-Minified Code Corpus
GitHub + minifier
EN: I know what you named your identifiers! NL: Ik weet wat je je ID's genoemd!
Natural language: non-trivial alignment
EN: I know what you named your identifiers! NL: Ik weet wat je je ID's genoemd!
Natural language: non-trivial alignment
EN: I know what you named your identifiers! NL: Ik weet wat je je ID's genoemd! function u(n, r) { function mix(dest, src){
Natural language: non-trivial alignment
EN: I know what you named your identifiers! NL: Ik weet wat je je ID's genoemd! function u(n, r) { function mix(dest, src){
Natural language: non-trivial alignment
Minification: straightforward alignment
function r(n, r) { for (var t in r) n[t] = r[t]; return n; }
function r(n, r) { for (var t in r) n[t] = r[t]; return n; }
function r(n, r) { for (var t in r) n[t] = r[t]; return n; }
function r(n, r) { for (var t in r) n[t] = r[t]; return n; }
function r(n, r) { for (var t in r) n[t] = r[t]; return n; }
function mix(dest, src) { }
function r(n, r) { for (var t in r) n[t] = r[t]; return n; }
function mix(dest, src) { }
Scope analysis
function r(n, r) { for (var t in r) n[t] = r[t]; return n; } function mix(dest, src) { for (var k in list) dest[k] = list[k]; return dest; }
(Sentence-by-sentence translation)
function r(n, r) { for (var t in r) n[t] = r[t]; return n; } function mix(dest, src) { for (var k in list) dest[k] = list[k]; return dest; }
(Sentence-by-sentence translation)
Language model scoring
Idea: try all, let language model decide which is more natural, on average, across ALL lines
Language model Translation model
[Raychev et al, 2015]
[Raychev et al, 2015]
(globals don’t change)
var geom2d = function() { var t = numeric.sum; function r(n, r) { this.x = n; this.y = r; } ... var geom2d = function() { var sum = numeric.sum; function Vector2d(x, y) { this.x = x; this.y = y; } ...
0.00 0.25 0.50 0.75 1.00 Autonym (Local) Autonym (All) JSNice (Local) JSNice (All) JSNaughty (Local) % names recovered − 2149 files
Local Global Autonym JSNice
0.00 0.25 0.50 0.75 1.00 0.00 0.25 0.50 0.75 1.00 Autonym File Accuracy JSNice File Accuracy 20 40 60 Frequency
Moses SMT Pre-processing Post-processing
Autonym
Moses SMT Pre-processing Post-processing
Autonym
0.00 0.25 0.50 0.75 1.00 Autonym (Local) Autonym (All) JSNice (Local) JSNice (All) JSNaughty (Local) JSNaughty (All) % names recovered − 2149 files
Autonym JSNice JSNaughty Global
 1  module.exports = http.createServer(function(e, r) {
 2    var t;
 3    var i = new stream.Stream();
 4    ...
 5    var n = "";
 6    csv().fromStream(e).on("data", function(e, r) {
 7      if (!t) { ... }
 8      var a = {};
 9      _(_.zip(t, e)).each(function(e) { ... });
10      i.emit("data", n + JSON.stringify(a));
11      n = ",";
12    }).on("end", function(e) {
13      i.emit("data", "]}");
14      i.emit("end");
15    }).on("error", function(e) {
16      i.emit("error", e);
17      console.log("csv error", e.message);
18    });
19  });
Original: error AUTONYM err JSNICE err JSNAUGHTY err Original: tuple AUTONYM tuple JSNICE key JSNAUGHTY tuple Original: headers AUTONYM headers JSNICE headers JSNAUGHTY headers Original: jsonStream AUTONYM i JSNICE s JSNAUGHTY s Original: req AUTONYM req JSNICE q JSNAUGHTY req Original: res AUTONYM res JSNICE r JSNAUGHTY res Original: separator AUTONYM data JSNICE sep JSNAUGHTY sep
Input program (minified) Output program (un-minified) Moses SMT Optional: Pre-processor Post-processor
Autonym
JSNice
Aligned clear-text/minified corpus Language model Translation model Clear-text corpus Model training
This material is based upon work supported by the National Science Foundation under Grant No. 1414172
https://github.com/bvasiles/jsNaughty
minified JS, decompiled C
technology (Moses)
to tokenization and scope analysis
JSNice on local names, on average
# Python if n % 3 == 0: Pseudo-code: if n is divisible by 3 // C# Console . WriteLine ( "Hello World!" ) ; // Java System . out . println ( "Hello World!" ) ;
code to pseudocode
porting C# to Java
# Python if n % 3 == 0: Pseudo-code: if n is divisible by 3 // C# Console . WriteLine ( "Hello World!" ) ; // Java System . out . println ( "Hello World!" ) ;
code to pseudocode
porting C# to Java
// Java public void findResultEdges() { for (Iterator it = dirEdgeList.iterator(); it.hasNext();) { DirectedEdge de = (DirectedEdge) it.next();…} } // C# public void FindResultEdges() { foreach (DirectedEdge de in _dirEdgeList){…} }
porting Java to C#