Thetazero Pubs
milica micicdata plotting plot words word ggplot libraries library news grams frequency

Code for the analysis

## Load the necessary libraries
library(ngram)
library(tm)
library(RWeka)
library(tau)
## Read in the data

# Read one corpus file into a character vector, one element per line.
#   path: location of the text file to read.
# The connection is opened in binary mode so that no platform text-mode
# translation is applied (in text mode on Windows a Ctrl-Z byte is treated as
# end of file, which silently truncates the read). Embedded nul bytes are
# skipped instead of cutting the current line short with a warning.
# assumes the corpus files are UTF-8 encoded -- TODO confirm
readCorpus <- function(path) {
    con <- file(path, "rb")
    on.exit(close(con), add = TRUE)    # release the connection even on error
    readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

tweets <- readCorpus("./final/en_US/en_US.twitter.txt")    # read in twitter data
blogs <- readCorpus("./final/en_US/en_US.blogs.txt")       # read in blogs data
news <- readCorpus("./final/en_US/en_US.news.txt")         # read in news data

# Totals for the complete files, in the order Twitter, Blogs, News
lineCount <- c(length(tweets), length(blogs), length(news))             # Number of lines
wordCount <- c(wordcount(tweets), wordcount(blogs), wordcount(news))    # Number of words
set.seed(53135)    # fixed seed so the sample is reproducible

# select a random 10% of the lines from each dataset
#
# Draw `fraction` of the elements of `lines` uniformly at random, without
# replacement. The sample size is rounded down to a whole number of lines.
# (The previous rbinom(n, size = length(x), prob = .5) call produced indices
# clustered around the middle of the file, with repeats, rather than a uniform
# sample of the lines.)
sampleLines <- function(lines, fraction = 0.10) {
    lines[sample.int(length(lines), size = floor(length(lines) * fraction))]
}

tweets <- sampleLines(tweets)
blogs <- sampleLines(blogs)
news <- sampleLines(news)

# combine data samples into one dataset
data <- c(tweets, blogs, news)
## Cleaning the dataset

# Replace curly single quotes (U+2018, U+2019) and the backtick with a plain
# apostrophe so that contractions keep their apostrophe through the filter
# below. The pattern previously contained a literal "?" where the curly quote
# belonged, which turned question marks into apostrophes instead.
data <- gsub("[`\u2018\u2019]", "'", data)

# Remove every character that is not a letter, a single space or an apostrophe,
# and also remove apostrophes at the beginning or end of a line and runs of two
# or more apostrophes.
data <- gsub("[^a-zA-Z' ]|(^'+)|('+$)|'{2,}", "", data)

# Apostrophes that start a word (preceded by spaces) or end a word (followed by
# spaces) are collapsed, together with those spaces, into one blank space.
data <- gsub("( +'+)|('+ +('+)?)", " ", data)

# NOTE(review): the text is still mixed-case at this point; confirm that the
# entries of profanity.txt match capitalised occurrences as intended.
profanity <- readLines("profanity.txt")                 # read in the list of profanity words
data <- removeWords(data, profanity)                    # remove profanity words from the dataset
data <- stripWhitespace(data)                           # remove extra whitespace from the dataset
## Identify n-grams

# Thin wrapper around textcnt(): tokens are separated on runs of whitespace,
# `order` is passed through as `n`, and the result is sorted by decreasing count.
countNgrams <- function(text, order) {
    textcnt(text, method = "string", n = order, split = "[[:space:]]+",
            decreasing = TRUE)
}

allwords <- countNgrams(data, 1)
bigrams <- countNgrams(data, 2)
threegrams <- countNgrams(data, 3)

## Remove stop words with "SMART" list (extended "en" list)
# The stop word list is handed over in reverse alphabetical order and applied
# to a lower-cased copy of the text.
smartStopwords <- sort(stopwords("SMART"), decreasing = TRUE)
stopwordsFree <- removeWords(tolower(data), smartStopwords)

## Identify n-grams without stop words
words <- countNgrams(stopwordsFree, 1)
bigramStopFree <- countNgrams(stopwordsFree, 2)
threegramStopFree <- countNgrams(stopwordsFree, 3)

Code for plotting

# Load the necessary plotting libraries
library(ggplot2)
library(grid)
library(scales)
library(gridExtra)
## Plot basic summary of the text files
# One row per (source, measure) pair: the three line counts come first,
# followed by the three word counts.
stats <- data.frame("files" = rep(c("Twitter", "Blogs", "News"), times = 2),
                    "Count" = c(lineCount, wordCount),
                    "Case" = rep(c("Line Count", "Word Count"), each = 3))

# One bar per source; each measure gets its own facet row with a free y scale
# because the two measures are plotted from the same Count column.
ggplot(stats, aes(x = files, y = Count)) +
    geom_bar(stat = "identity", position = "dodge", color = "black", fill = "#FF9999") +
    facet_grid(Case ~ ., scales = "free_y") +
    scale_y_continuous(labels = comma) +
    labs(x = "", y = "Total No.") +
    ggtitle("Fig.1: Basic Summary of the Text Sources")
## Plot the 20 most frequent n-grams
# Three side-by-side charts (a segment from zero plus a point per n-gram,
# flipped so the n-grams are listed along the vertical axis). Each n-gram
# column is a factor whose levels follow the order of the first 20 entries, so
# ggplot keeps that order instead of sorting the labels alphabetically.

# Left (words)
allwordsStats <- data.frame("onegrams" = factor(names(allwords[1:20]), levels=names(allwords[1:20])), 
                            "frequency" = allwords[1:20])
p1 <- ggplot(allwordsStats, aes(x=onegrams, y=frequency)) + 
    xlab("") + ylab("Frequency") + ggtitle("1-grams") +
    geom_segment(aes(xend=onegrams, yend=0), color="darkblue") +  
    geom_point(size=3, color="red") + coord_flip() +                
    scale_y_continuous(labels = comma) + 
    theme(plot.margin=unit(c(1,3.5,1,0), "lines"), plot.title=element_text(size=11),
          axis.title.x=element_text(size=11), axis.text.x=element_text(angle=45))

# Middle (2-grams)
bigramStats <- data.frame("bigrams" = factor(names(bigrams[1:20]), levels=names(bigrams[1:20])), 
                          "frequency" = bigrams[1:20])
p2 <- ggplot(bigramStats, aes(x=bigrams, y=frequency)) + 
    xlab("") + ylab("Frequency") + ggtitle("2-grams") +
    geom_segment(aes(xend=bigrams, yend=0), color="darkblue") +
    geom_point(size=3, color="red") + coord_flip() +
    scale_y_continuous(labels = comma) + 
    theme(plot.margin=unit(c(1,2.5,1,0), "lines"), plot.title=element_text(size=11),
          axis.title.x=element_text(size=11), axis.text.x=element_text(angle=45))

# Right (3-grams)
threegramStats <- data.frame("threegrams" = factor(names(threegrams[1:20]), levels=names(threegrams[1:20])), 
                             "frequency" = threegrams[1:20])
p3 <- ggplot(threegramStats, aes(x=threegrams, y=frequency)) + 
    xlab("") + ylab("Frequency") + ggtitle("3-grams") +
    geom_segment(aes(xend=threegrams, yend=0), color="darkblue") +
    geom_point(size=3, color="red") + coord_flip() +
    scale_y_continuous(labels = comma) + 
    theme(plot.margin=unit(c(1,0,1,0), "lines"), plot.title=element_text(size=11),
          axis.title.x=element_text(size=11), axis.text.x=element_text(angle=45))

# ggplot multiplot
# The figure title is passed through `top`: gridExtra 2.0.0 replaced the former
# `main` argument with `top`, and `main` is no longer accepted.
grid.arrange(p1, p2, p3, ncol = 3, top = "Fig.2: Most Frequent N-grams in the Sample")
## Plot the 20 most frequent n-grams without stop words
# Same three-panel layout as the plot of all n-grams, built from the counts
# computed on the stop-word-free text. Each n-gram column is a factor whose
# levels follow the order of the first 20 entries so that order is kept on the
# axis; the y-axis labels use a smaller font (size 9) here.

# Left (words)
wordStats <- data.frame("onegrams" = factor(names(words[1:20]), levels=names(words[1:20])), 
                        "frequency" = words[1:20])
p4 <- ggplot(wordStats, aes(x=onegrams, y=frequency)) + 
    xlab("") + ylab("Frequency") + ggtitle("1-grams") +
    geom_segment(aes(xend=onegrams, yend=0), color="darkblue") +
    geom_point(size=3, color="red") + coord_flip() +
    scale_y_continuous(labels = comma) + 
    theme(plot.margin=unit(c(1,4,1,0), "lines"), plot.title=element_text(size=11),
          axis.title.x=element_text(size=11), axis.text.x=element_text(angle=45),
          axis.text.y=element_text(size=9))

# Middle (2-grams)
bigramStopStats <- data.frame("bigrams" = factor(names(bigramStopFree[1:20]), levels=names(bigramStopFree[1:20])), 
                              "frequency" = bigramStopFree[1:20])
p5 <- ggplot(bigramStopStats, aes(x=bigrams, y=frequency)) + 
    xlab("") + ylab("Frequency") + ggtitle("2-grams") +
    geom_segment(aes(xend=bigrams, yend=0), color="darkblue") +
    geom_point(size=3, color="red") + coord_flip() +
    scale_y_continuous(labels = comma) + 
    theme(plot.margin=unit(c(1,1.8,1,0), "lines"), plot.title=element_text(size=11),
          axis.title.x=element_text(size=11), axis.text.x=element_text(angle=45),
          axis.text.y=element_text(size=9))

# Right (3-grams)
threegramStopStats <- data.frame("threegrams" = factor(names(threegramStopFree[1:20]), levels=names(threegramStopFree[1:20])), 
                                 "frequency" = threegramStopFree[1:20])
p6 <- ggplot(threegramStopStats, aes(x=threegrams, y=frequency)) + 
    xlab("") + ylab("Frequency") + ggtitle("3-grams") +
    geom_segment(aes(xend=threegrams, yend=0), color="darkblue") +
    geom_point(size=3, color="red") + coord_flip() +
    scale_y_continuous(labels = comma) + 
    theme(plot.margin=unit(c(1,0,1,0), "lines"), plot.title=element_text(size=11),
          axis.title.x=element_text(size=11), axis.text.x=element_text(angle=45),
          axis.text.y=element_text(size=9))

# ggplot multiplot
# The figure title is passed through `top`: gridExtra 2.0.0 replaced the former
# `main` argument with `top`, and `main` is no longer accepted.
grid.arrange(p4, p5, p6, ncol = 3, top = "Fig.3: Most Frequent Context N-grams in the Sample")
Copyright © 2016 thetazero.com All Rights Reserved. Privacy Policy