Literary Data: Some Approaches Andrew Goldstone - - PowerPoint PPT Presentation

literary data some approaches
SMART_READER_LITE
LIVE PREVIEW

Literary Data: Some Approaches Andrew Goldstone - - PowerPoint PPT Presentation

Literary Data: Some Approaches Andrew Goldstone http://www.rci.rutgers.edu/~ag978/litdata February 26, 2015. Functions, abstraction, modularity. homework 5 Questions? formal parameter param is now bound to the argument x function_name <-


slide-1
SLIDE 1

Literary Data: Some Approaches

Andrew Goldstone http://www.rci.rutgers.edu/~ag978/litdata February 26, 2015. Functions, abstraction, modularity.

slide-2
SLIDE 2

homework 5

▶ Questions?

slide-3
SLIDE 3

functions

▶ map inputs to outputs ▶ possibly with side effects (as rarely as possible)

function_name <- function (param) {
    # anything...
    # at...
    # all...
}
function_name(x)

formal parameter param is now bound to the argument x

slide-4
SLIDE 4

functions

▶ map inputs to outputs ▶ possibly with side effects (as rarely as possible)

function_name <- function (param) {
    # anything...
    # at...
    # all...
}
function_name(x)

▶ formal parameter param is now bound to the argument x

slide-5
SLIDE 5

# Join s to itself with a single space in between. str_c works
# elementwise here, so a character vector is doubled element by element.
twice <- function (s) {
  doubled <- str_c(s, s, sep = " ")
  doubled
}

twice("ha")

# The argument can be any expression that yields a string.
twice(str_c(letters, collapse = " "))

slide-6
SLIDE 6

# General shape of a function that takes several parameters.
function_name <- function (param1, param2, ...) {
  ...
}

# Repeat the string s n times, separated by single spaces.
#
# s: the string to repeat.
# n: number of copies; a single number, at least 1.
# Returns one string containing n copies of s.
many_times <- function (s, n) {
  stopifnot(length(n) == 1, n >= 1)
  result <- s
  # seq_len(n - 1) is empty when n is 1, so the loop body is skipped.
  # The range 2:n would instead count down (2, 1) and add two extra copies.
  for (j in seq_len(n - 1)) {
    result <- str_c(result, s, sep = " ")
  }
  result
}

many_times("ha", 5)

[1] "ha ha ha ha ha"

slide-7
SLIDE 7

binding named parameters

many_times("ha", n=5)

[1] "ha ha ha ha ha"

many_times(n=5, s="ha") # !!

[1] "ha ha ha ha ha"

slide-8
SLIDE 8

scope

many_times("no", 2)

[1] "no no"

result

Error in eval(expr, envir, enclos): object 'result' not found

slide-9
SLIDE 9

for, begone

rep("O", 4)

[1] "O" "O" "O" "O"

▶ use rep to rewrite many_times without for

slide-10
SLIDE 10

# Repeat the string s n times, separated by single spaces, with no loop:
# rep builds the n copies and str_c collapses them into one string.
many_times <- function (s, n) {
  copies <- rep(s, n)
  str_c(copies, collapse = " ")
}

slide-11
SLIDE 11

abstraction

three_weeks <- readLines("three-weeks-gutenberg.txt") # get the body text metadata_start <- match aararrgggh not again

slide-12
SLIDE 12

# Extract the body of a text held as a character vector of lines.
#
# ll: character vector, one element per line of the file.
# start_pat: regular expression matching the first line of the body.
# end_pat: regular expression matching the last line of the body.
# Returns the lines from the first start_pat match through the first
# end_pat match at or after it, both inclusive.
# Stops with an error if either boundary cannot be found.
gutenberg_body <- function (ll, start_pat, end_pat) {
  starts <- grep(start_pat, ll)
  if (length(starts) == 0) {
    stop("no line matches start_pat: ", start_pat, call. = FALSE)
  }
  start <- starts[1]

  # Ignore end matches that come before the start; otherwise start:end
  # would run backwards and return the lines in reverse order.
  ends <- grep(end_pat, ll)
  ends <- ends[ends >= start]
  if (length(ends) == 0) {
    stop(
      "no line at or after the start matches end_pat: ", end_pat,
      call. = FALSE
    )
  }

  ll[start:ends[1]]
}

three_weeks_body <- gutenberg_body(three_weeks, "^CHAPTER", "^THE END")

slide-13
SLIDE 13

three_weeks_words <- tolower unlist strsp aarrrgh

Homework…

featurize <- function (ll) { # old familiar friends }

slide-14
SLIDE 14

three_weeks_words <- tolower unlist strsp aarrrgh

Homework…

featurize <- function (ll) { # old familiar friends }

slide-15
SLIDE 15

global and local

three_weeks_body[1]

[1] "CHAPTER I"

# Join three fixed week labels into one string, using separator between
# them.
three_week_printer <- function (separator) {
  # This assignment creates a variable local to the function; a variable
  # of the same name outside the function is not modified.
  three_weeks_body <- c("Week 1", "Week 2", "Week 3")
  str_c(three_weeks_body, collapse = separator)
}

three_week_printer("...") # but what if...?

[1] "Week 1...Week 2...Week 3"

three_weeks_body[1]

[1] "CHAPTER I"

slide-16
SLIDE 16

scope: even more

# Join each word vector with spaces, then join the two resulting strings
# with " + ".
smoosh <- function (words1, words2) {
  # helper is defined inside smoosh, so it is only visible in here.
  helper <- function (ws) {
    str_c(ws, collapse = " ")
  }

  first <- helper(words1)
  second <- helper(words2)
  str_c(first, second, sep = " + ")
}

smoosh(c("uh", "huh"), c("that's", "the"))

[1] "uh huh + that's the"

helper(c("uh", "huh"))

Error in eval(expr, envir, enclos): could not find function "helper"

slide-17
SLIDE 17

closure

x <- 10

# Add y to x. x is not a parameter: it is looked up outside the function
# each time f is called.
f <- function (y) {
  x + y
}

f(5)

[1] 15

x <- 100 f(5)

[1] 105

slide-18
SLIDE 18

early escape

# Return the first four elements of ll, or all of ll when it has fewer
# than four elements.
first_few <- function (ll) {
  # Long enough to cut down: leave early with the first four elements.
  if (length(ll) >= 4) {
    return(ll[1:4])
  }
  ll
}

first_few(1:2)

[1] 1 2

first_few(1:200)

[1] 1 2 3 4

slide-19
SLIDE 19

recursion

Algorithm: QuickSort.

  • 1. Choose the first element as “pivot.”
  • 2. Partition the vector into two pieces by comparing to the pivot.
  • 3. QuickSort the two pieces.
slide-20
SLIDE 20

# Sort a vector in increasing order with recursive QuickSort, using the
# first element as the pivot.
qsort <- function (xs) {
  # Vectors of length 0 or 1 are already sorted.
  if (length(xs) < 2) {
    return(xs)
  }

  pivot <- xs[1]
  others <- xs[-1]

  # Partition the remaining elements around the pivot.
  smaller_or_equal <- others[others <= pivot]
  larger <- others[others > pivot]

  c(qsort(smaller_or_equal), pivot, qsort(larger))
}

qsort(c(4, 2, 3, 5, 1))

[1] 1 2 3 4 5

slide-21
SLIDE 21

reuse and refine reuse

# Reduce each publication-date string to a four-digit year.
#
# pubdates: character vector of publication-date strings.
# Returns a character vector of the same length. A string made of leading
# non-digit characters followed by at least four digits is replaced by
# those four digits; any other string is returned unchanged.
extract_years <- function (pubdates) {
  gsub("^\\D*(\\d{4}).*$", "\\1", pubdates)
}

# TRUE is spelled out: T is an ordinary variable that can be reassigned.
ecco <- read.csv("ecco-headers.csv", as.is = TRUE, encoding = "UTF-8")
ecco_years <- extract_years(ecco$pubdate)

eebo <- read.csv("eebo-headers.csv", as.is = TRUE, encoding = "UTF-8")
eebo_years <- extract_years(eebo$pubdate)

# Did every entry come out as exactly four digits?
all(grepl("^\\d{4}$", eebo_years))

[1] FALSE

slide-22
SLIDE 22

rats!

# Reduce each publication-date string to a four-digit year, or NA.
#
# pubdates: character vector of publication-date strings.
# Returns a character vector of the same length, holding the four-digit
# year where one was extracted and NA everywhere else.
extract_years <- function (pubdates) {
  result <- gsub("^\\D*(\\d{4}).*$", "\\1", pubdates)
  # invert = TRUE selects the positions that are NOT exactly four digits,
  # i.e. the strings the substitution above left unchanged.
  missing_year <- grep("^\\d{4}$", result, invert = TRUE)
  result[missing_year] <- NA
  result
}

eebo_years <- extract_years(eebo$pubdate)
sum(is.na(eebo_years))

[1] 540

eebo_years <- eebo_years[!is.na(eebo_years)] # bye ecco_years <- extract_years(ecco$pubdate) sum(is.na(ecco_years)) # cool

[1] 0

slide-23
SLIDE 23

eebo_years_table <- table(eebo_years)
eebo_sorted <- names sort descending wait

# Return the most frequent values in xs, most frequent first.
#
# xs: vector of values to tabulate.
# n: maximum number of values to return (default 10).
# Returns a character vector of at most n values. head() is used instead
# of [1:n] so that asking for more values than exist does not pad the
# result with NA, and n = 0 gives an empty result.
top_hits <- function (xs, n = 10) {
  sorted <- sort(table(xs), decreasing = TRUE)
  head(names(sorted), n)
}

top_hits(eebo_years)

[1] "1660" "1642" "1641" "1659" "1689" "1680" "1681"
[8] "1688" "1685" "1682"

slide-24
SLIDE 24

modularity

# Turn year strings into decade labels by replacing the final digit of
# each string with "0s".
years_decades <- function (years) {
  decade_labels <- gsub("\\d$", "0s", years)
  decade_labels
}

top_hits(years_decades(eebo_years))

[1] "1680s" "1640s" "1690s" "1650s" "1660s" "1670s"
[7] "1630s" "1620s" "1600s" "1610s"

top_hits(years_decades(ecco_years))

[1] "1790s" "1780s" "1770s" "1760s" "1750s" "1710s"
[7] "1740s" "1730s" "1700s" "1720s"

slide-25
SLIDE 25

top_hits(featurize(three_weeks_body)) # reuse!

[1] "the" "and" "of" "to" "he" "a" "his" "was"
[9] "in" "her"

slide-26
SLIDE 26

function composition is chaining

# Same computation as top_hits(years_decades(eebo_years)), written as a
# left-to-right chain.
eebo_years %>%
  years_decades() %>%
  top_hits()

# Same computation as top_hits(featurize(three_weeks_body)).
three_weeks_body %>%
  featurize() %>%
  top_hits()