Web Scraping with R (2):
Regular Expressions

Alex Sanchez ()
Francesc Carmona ()
GME Department. Universitat de Barcelona
Statistics and Bioinformatics Unit. Vall d’Hebron Institut de Recerca

March 2020

Readme

Disclaimer and acknowledgements

Introduction

When would one need regular expressions?

Functions for Pattern Matching

Detect Patterns

# Example strings: only the last two contain the substring "ex".
string <- c("Regular", "expression", "examples of R language")
# value = FALSE returns the indices of the matching elements.
grep("ex", string, value = FALSE)
## [1] 2 3
# value = TRUE returns the matching elements themselves.
grep("ex", string, value = TRUE)
## [1] "expression"             "examples of R language"
# grepl() returns one logical per element instead of indices.
grepl("ex", string)
## [1] FALSE  TRUE  TRUE

Locate Patterns (1)

# regexpr() gives, for each element, the start of the FIRST match
# (-1 when there is none); match lengths are stored in "match.length".
string <- c("Regular", "expression", "examples of R language")
x <- regexpr(pattern = "ex", text = string)
x
## [1] -1  1  1
## attr(,"match.length")
## [1] -1  2  2
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE

Locate Patterns (2)

# gregexpr() returns a list with one entry per input element, each holding
# the start positions of ALL matches in that element (-1 when none).
string <- c("Regular", "expression", "examples of R language")
x <- gregexpr(pattern = "x*ress", text = string)
x
## [[1]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[2]]
## [1] 4
## attr(,"match.length")
## [1] 4
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[3]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE

Replace Patterns

string <- "He is now 25 years old, and weights 130lbs"
# sub() replaces only the first match: a single digit is removed.
x <- sub(pattern = "[[:digit:]]", replacement = "", x = string)
x
## [1] "He is now 5 years old, and weights 130lbs"
# gsub() replaces every match: all digits are removed.
x <- gsub(pattern = "[[:digit:]]", replacement = "", x = string)
x
## [1] "He is now  years old, and weights lbs"

Extract Patterns (1)

x <- c("Arkansas", "Alabama", "Calabash", "Washington")
# Sequences starting with A or a and continuing up to the next a.
pattern <- "[Aa][^a]*a"
# First match per element; elements without a match are dropped.
regmatches(x, m = regexpr(pattern, text = x))
## [1] "Arka" "Ala"  "ala"
# invert = TRUE returns the pieces around the first match instead.
regmatches(x, m = regexpr(pattern, text = x), invert = TRUE)
## [[1]]
## [1] ""     "nsas"
## 
## [[2]]
## [1] ""     "bama"
## 
## [[3]]
## [1] "C"    "bash"
## 
## [[4]]
## [1] "Washington"

Extract Patterns (2)

x <- c("Arkansas", "Alabama", "Calabash", "Washington")
# Sequences starting with A or a and continuing up to the next a.
pattern <- "[Aa][^a]*a"
# All matches per element, returned as a list parallel to x.
regmatches(x, m = gregexpr(pattern, text = x))
## [[1]]
## [1] "Arka"
## 
## [[2]]
## [1] "Ala" "ama"
## 
## [[3]]
## [1] "ala"
## 
## [[4]]
## character(0)
# invert = TRUE returns the pieces between the matches instead.
regmatches(x, m = gregexpr(pattern, text = x), invert = TRUE)
## [[1]]
## [1] ""     "nsas"
## 
## [[2]]
## [1] ""  "b" "" 
## 
## [[3]]
## [1] "C"    "bash"
## 
## [[4]]
## [1] "Washington"

So what are regular expressions?

Forming regular expressions

String functions and patterns

Common Regex tasks

Regular expression syntax

Basic exact character matching

# library() stops with an error if stringr is missing, whereas require()
# would only return FALSE and let the calls below fail later.
library(stringr)
x <- c("apple", "banana", "pear")
# str_extract() returns the first match per element, or NA when none.
str_extract(x, "an")
## [1] NA   "an" NA
bananas <- c("banana", "Banana", "BANANA")
# Matching is case-sensitive by default ...
str_detect(bananas, "banana")
## [1]  TRUE FALSE FALSE
# ... unless the pattern is wrapped in regex(ignore_case = TRUE).
str_detect(bananas, regex("banana", ignore_case = TRUE))
## [1] TRUE TRUE TRUE

The str_whatever functions

# library() stops with an error if stringr is missing, whereas require()
# would only return FALSE and let the calls below fail later.
library(stringr)
example.obj <- "1. A small sentence. - 2. Another tiny sentence."
# str_extract() returns only the first match.
str_extract(example.obj, "e")
## [1] "e"
# str_extract_all() returns every match, as a list with one entry per input.
str_extract_all(example.obj, "e")
## [[1]]
## [1] "e" "e" "e" "e" "e" "e" "e"

str_extract vs grep

x <- c("apple", "banana", "pear")
# str_extract() keeps one result per input: the matched text, or NA.
str_extract(string = x, pattern = "an")
## [1] NA   "an" NA
# grep(value = TRUE) keeps only the elements that contain a match ...
grep(pattern = "an", x = x, value = TRUE)
## [1] "banana"
# ... as does str_subset().
str_subset(string = x, pattern = "an")
## [1] "banana"

Refining the search for a character:
Specifying location

x <- c("apple", "banana", "pear")
# "^" anchors the pattern to the start of the string.
str_detect(string = x, pattern = "^a")
## [1]  TRUE FALSE FALSE

Syntax (1) More about character matching: wildcards

x <- c("apple", "banana", "pear")
# "." stands for any single character, so ".a." needs a character on each
# side of the a.
str_extract(string = x, pattern = ".a.")
## [1] NA    "ban" "ear"
# By default "." does not match a newline ...
str_detect(string = "\nX\n", pattern = ".X.")
## [1] FALSE
# ... unless dotall = TRUE is set.
str_detect(string = "\nX\n", pattern = regex(".X.", dotall = TRUE))
## [1] TRUE
example.obj <- "1. A small sentence. - 2. Another tiny sentence."
str_extract(string = example.obj, pattern = "sm.ll")
## [1] "small"
example.obj.2 <- "The cat sat on the mat"
# First match only ...
str_extract(string = example.obj.2, pattern = ".at")
## [1] "cat"
# ... versus all matches.
str_extract_all(string = example.obj.2, pattern = ".at")
## [[1]]
## [1] "cat" "sat" "mat"

Syntax (2): Escape sequences

# "\\." in R source is the regex \. and matches a literal dot.
str_extract(string = c("abc", "a.c", "bef"), pattern = "a\\.c")
## [1] NA    "a.c" NA
# The string below holds a single backslash; it is doubled only in R source.
x <- "a\\b"
writeLines(text = x)
## a\b
# Matching one literal backslash takes four backslashes in R source.
str_extract(string = x, pattern = "\\\\")
## [1] "\\"

Syntax (3): More on Escape sequences

Example: suppose you delimit your pattern with single quotes and want to find the countries whose names contain a single quote (’). You then have to “escape” the single quote inside the pattern by preceding it with a backslash (“\”), so that it is read as a literal character and not as part of the string-specifying machinery:

library(XML)
library(RCurl)
# Download the page and parse every HTML table it contains.
url <- getURL("https://www.nationsonline.org/oneworld/countries_of_the_world.htm")
# stringsAsFactors = TRUE is requested explicitly: the levels() calls below
# need factor columns, and strings are no longer converted to factors by
# default since R 4.0.
df <- readHTMLTable(url, header = TRUE, stringsAsFactors = TRUE)
# Pool the distinct values of column V2 from the first five tables.
countries <- c(levels(df[[1]]$V2), levels(df[[2]]$V2), levels(df[[3]]$V2),
               levels(df[[4]]$V2), levels(df[[5]]$V2))
# "\'" is an escaped single quote: list the names that contain one.
grep("\'", countries, value = TRUE)
## [1] "Côte D'ivoire (Ivory Coast)"
## [2] "Korea, Democratic People's Rep. (North Korea)"
## [3] "Lao, People's Democratic Republic"

Syntax (4): Quantifiers

Syntax (4): Quantifiers (examples)

(strings <- c("a", "ab", "acb", "accb", "acccb", "accccb"))
# * : zero or more c  -> "ab" "acb" "accb" "acccb" "accccb"
grep(pattern = "ac*b", x = strings, value = TRUE)
# + : one or more c   -> "acb" "accb" "acccb" "accccb"
grep(pattern = "ac+b", x = strings, value = TRUE)
# ? : zero or one c   -> "ab" "acb"
grep(pattern = "ac?b", x = strings, value = TRUE)
# {2} : exactly two c -> "accb"
grep(pattern = "ac{2}b", x = strings, value = TRUE)
# {2,} : two or more c -> "accb" "acccb" "accccb"
grep(pattern = "ac{2,}b", x = strings, value = TRUE)
# {2,3} : two or three c -> "accb" "acccb"
grep(pattern = "ac{2,3}b", x = strings, value = TRUE)

Exercise

Find all countries with ee in their name using quantifiers.

## [1] "Cocos (Keeling) Islands"       "Greece"                       
## [3] "Greenland"                     "Holy See"                     
## [5] "Vatican City State (Holy See)"

Syntax (5): Position of pattern within the string:
anchors

(strings <- c("abcd", "cdab", "cabd", "c abd"))
# "ab" anywhere in the string.
grep(pattern = "ab", x = strings, value = TRUE)
# "^" : "ab" at the start of the string.
grep(pattern = "^ab", x = strings, value = TRUE)
# "$" : "ab" at the end of the string.
grep(pattern = "ab$", x = strings, value = TRUE)
# "\\b" : "ab" at the start of a word.
grep(pattern = "\\bab", x = strings, value = TRUE)
# "\\b" matches the empty position at each word boundary ...
str_replace_all(string = "The quick brown fox", pattern = "\\b",
                replacement = "_")
## [1] "_The_ _quick_ _brown_ _fox_"
# ... and "\\B" matches every position that is not a word boundary.
str_replace_all(string = "The quick brown fox", pattern = "\\B",
                replacement = "_")
## [1] "T_h_e q_u_i_c_k b_r_o_w_n f_o_x"

Exercises

Find the countries whose names end with land.

##  [1] "Christmas Island" "Finland"          "Greenland"        "Iceland"         
##  [5] "Ireland"          "New Zealand"      "Pitcairn Island"  "Poland"          
##  [9] "Reunion Island"   "Swaziland"        "Switzerland"      "Thailand"

Find the countries that have the word and in their name.

## [1] "Antigua and Barbuda"              "Bosnia and Herzegovina"          
## [3] "Saint Kitts and Nevis"            "Saint Vincent and the Grenadines"
## [5] "Sao Tome and Principe"            "Trinidad and Tobago"             
## [7] "Turks and Caicos Islands"         "Wallis and Futuna Islands"

Syntax (6): Operators

Syntax (6): Operators examples

(strings <- c("^ab", "ab", "abc", "abd", "abe", "ab 12", "acb"))
# "." : "ab" followed by any single character.
grep(pattern = "ab.", x = strings, value = TRUE)
# "[c-e]" : "ab" followed by one character in the range c to e.
grep(pattern = "ab[c-e]", x = strings, value = TRUE)
# "[^c]" : "ab" followed by one character other than c.
grep(pattern = "ab[^c]", x = strings, value = TRUE)
# Unescaped "^" is the start-of-string anchor.
grep(pattern = "^ab", x = strings, value = TRUE)
# Escaped, it matches a literal caret.
grep(pattern = "\\^ab", x = strings, value = TRUE)
# "|" : either alternative.
grep(pattern = "abc|abd", x = strings, value = TRUE)
# "(ab)" captures a group that "\\1" re-inserts in the replacement.
gsub(pattern = "(ab) 12", replacement = "\\1 34", x = strings)
# Download the Wikipedia list of culinary fruits and parse its HTML tables.
url <- getURL("https://en.wikipedia.org/wiki/List_of_culinary_fruits")
# stringsAsFactors = TRUE is requested explicitly: the levels() calls below
# need factor columns, and strings are no longer converted to factors by
# default since R 4.0.
df <- readHTMLTable(url, header = TRUE, stringsAsFactors = TRUE)
# Pool the distinct "Common name" values from the first eight tables.
fruits <- c(levels(df[[1]]$`Common name`), levels(df[[2]]$`Common name`),
            levels(df[[3]]$`Common name`), levels(df[[4]]$`Common name`),
            levels(df[[5]]$`Common name`), levels(df[[6]]$`Common name`),
            levels(df[[7]]$`Common name`), levels(df[[8]]$`Common name`))
# "(..)\\1" : any two characters immediately repeated (backreference).
pattern <- "(..)\\1"
str_subset(fruits, pattern)
## [1] "Bolivian mountain coconut" "King coconut"             
## [3] "Sea coconut"               "Salal"                    
## [5] "Cassabanana"               "Banana"

Exercise

Find the countries whose names contain the letter i or t and end with land, and replace land with LAND.

## [1] "Christmas IsLAND" "FinLAND"          "IceLAND"          "IreLAND"         
## [5] "Pitcairn IsLAND"  "Reunion IsLAND"   "SwaziLAND"        "SwitzerLAND"     
## [9] "ThaiLAND"

Syntax (7): Character classes

More character classes

Note:
* [:...:] has to be used inside square brackets, e.g. [[:digit:]].
* \ itself is a special character that needs to be escaped, e.g. \\d. Do not confuse these regular expressions with R escape sequences such as \t.

General modes for patterns

Functions in the stringr package

Functions in the stringr package

Functions in the stringr package

Functions in stringr vs functions in base R

Functions in stringr vs functions in base R

Functions in stringr vs functions in base R

Resources