library(data.table)
library(dplyr)
library(anytime)
library(tidytext)
library(textclean)
library(stopwords)
library(udpipe)
library(lubridate)
library(ngram)
# Extract bigrams, trigrams and four-grams from every sufficiently long
# sentence of an annotated token table and save them to ./outputs, one
# .RData file per sentence and n-gram size.
#
# Input : 'DAT_dfudpipe.RData', which must provide a data frame `df` with one
#         row per token and (at least) the columns lower, doc_id,
#         paragraph_id, sentence_id, sentence, token_id, token, lemma, upos,
#         feats, dep_rel and misc.
# Output: ./outputs/bigram_<textid>.RData   (object `df_bigram`)
#         ./outputs/trigram_<textid>.RData  (object `df_trigram`)
#         ./outputs/fourgram_<textid>.RData (object `df_fourgram`)

# Build the sliding-window n-gram columns for the tokens of one sentence.
#
# tokens: data frame with one row per token, in sentence order.
# size  : window width (2, 3 or 4).
# Returns a data.frame with nrow(tokens) - size + 1 rows. For each of the
# fields token, lemma, upos, feats, dep_rel and misc it holds one column per
# window slot, named <slot>_<field> with slots first / mid / pen / last
# (only the slots needed for `size`), grouped by field.
ngram_columns <- function(tokens, size) {
  slots <- switch(
    as.character(size),
    '2' = c('first', 'last'),
    '3' = c('first', 'mid', 'last'),
    '4' = c('first', 'mid', 'pen', 'last'),
    stop('unsupported n-gram size: ', size, call. = FALSE)
  )
  n_tokens <- nrow(tokens)
  cols <- list()
  for (field in c('token', 'lemma', 'upos', 'feats', 'dep_rel', 'misc')) {
    values <- tokens[[field]]
    for (k in seq_along(slots)) {
      # Slot k of the window covers tokens k .. (n_tokens - size + k).
      cols[[paste(slots[k], field, sep = '_')]] <-
        values[k:(n_tokens - size + k)]
    }
  }
  do.call(data.frame, cols)
}

# Prefix every n-gram row with the sentence-level columns of its sentence.
#
# tokens: token rows of one sentence (only the first row is used).
# ngrams: result of ngram_columns() for the same sentence.
# label : value written to the `ngram` column ('two', 'three' or 'four').
# Returns the columns lower:textid and sentence taken from the first token
# row, repeated once per n-gram, followed by the n-gram columns, a running
# index `n` and the `ngram` label.
attach_sentence_info <- function(tokens, ngrams, label) {
  n_rows <- nrow(ngrams)
  tokens %>%
    select(lower:textid, sentence) %>%
    slice(1) %>%
    # `!!` injects the local values so that a column which happens to be
    # called `n_rows` or `label` cannot shadow them.
    slice(rep(seq_len(n()), each = !!n_rows)) %>%
    bind_cols(ngrams) %>%
    mutate(n = seq_len(n())) %>%
    mutate(ngram = !!label)
}

load('DAT_dfudpipe.RData')

# One id per sentence.
df <- df %>%
  mutate(textid = paste(lower, doc_id, paragraph_id, sentence_id, sep = '_'))
all_ids <- sort(unique(df$textid))
n_ids <- length(all_ids)

# Row positions of every sentence, computed once, so the loop does not have
# to scan the whole table again for each id.
rows_by_id <- split(seq_len(nrow(df)), df$textid)

# save() does not create missing directories.
dir.create('./outputs', showWarnings = FALSE, recursive = TRUE)

cntr <- 1
for (i in all_ids) {
  sentence_rows <- rows_by_id[[i]]
  # Drop rows whose token_id contains '-' (presumably multiword-token range
  # rows such as "3-4" in the udpipe output -- confirm).
  tmpdf <- df[sentence_rows, ] %>%
    filter(!grepl('-', token_id))

  # Keep only sentences with at least 10 tokens in which some token's feats
  # contain 'VerbForm=Fin' (presumably a finite verb -- confirm).
  if (nrow(tmpdf) >= 10 &&
      grepl('VerbForm\\=Fin', paste(tmpdf$feats, collapse = ' '))) {
    print(paste0(cntr, '/', n_ids))

    newdf <- ngram_columns(tmpdf, 2)
    df_bigram <- attach_sentence_info(tmpdf, newdf, 'two')

    newdf_three <- ngram_columns(tmpdf, 3)
    df_trigram <- attach_sentence_info(tmpdf, newdf_three, 'three')

    newdf_four <- ngram_columns(tmpdf, 4)
    df_fourgram <- attach_sentence_info(tmpdf, newdf_four, 'four')

    # Results are written per sentence; an earlier variant that accumulated
    # them in memory with rbind() was disabled.
    save(df_bigram, file = paste0('./outputs/bigram_', i, '.RData'))
    save(df_trigram, file = paste0('./outputs/trigram_', i, '.RData'))
    save(df_fourgram, file = paste0('./outputs/fourgram_', i, '.RData'))
  }
  cntr <- cntr + 1
}