library(tidyverse)
library(ggplot2)
library(gridExtra)

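#' input (read from the output directory; columns used by this script):
#'   network-scores-qlinkabs-histogram.csv (tag, qlinkAbs, count_n)
#'   network-scores-degrees.csv (tag, degree)
#'   network-scores-pagerank.csv (tag, score)
#'   network-scores-components.csv (tag, size)
#' output (written to the output directory):
#'   hirsch-index.csv (tag, degree)
#'   hirsch.png, clusters.png, network.png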
#'
#' In RStudio you can run this script in the console:
#' system("Rscript scripts/network-transform.R szte")

# Command line handling: the first trailing argument is the directory that
# contains the network-scores-*.csv inputs and receives the generated files.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) == 0) {
  stop(
    "At least one argument must be supplied (output directory).\n",
    call. = FALSE
  )
}
# Use the directory given by the caller. It used to be overwritten by a
# hard-coded development path, which made the argument a no-op.
output_dir <- args[1]

# Read the Qlink histogram (one row per tag / qlinkAbs value with the number
# of records in count_n) from the output directory.
prefix <- 'network-scores-qlinkabs-histogram'
csv <- file.path(output_dir, paste0(prefix, ".csv"))
if (!file.exists(csv)) {
  stop("input file ", csv, " does not exist!")
}
df <- read_csv(csv)

# Keep the 12 tags with the largest summed count_n
# (add filter(tag != 'all') before grouping to leave out the aggregate row).
tag_totals <- summarise(group_by(df, tag), total = sum(count_n))
tags <- tag_totals %>%
  arrange(desc(total)) %>%
  slice_max(total, n = 12) %>%
  pull(tag)
print(tags)

# Histogram rows of the selected tags only.
df2 <- filter(df, tag %in% tags)

# Largest bin height among the selected tags; used to position plot labels.
max_count <- max(df2$count_n)

# Per-tag descriptive statistics of qlinkAbs, weighted by record count.
# Every histogram row (qlinkAbs, count_n) is expanded into count_n copies of
# qlinkAbs, so mean() and sd() describe records rather than histogram bins.
df4 <- data.frame(tag = tags)
for (tag_name in tags) {
  df3 <- filter(df2, tag == tag_name)
  row <- df4$tag == tag_name

  # One rep() call expands the whole histogram. The former loop over
  # count(df3) ran exactly once and stored its result in a variable that
  # masked base::abs().
  qlinks <- rep(df3$qlinkAbs, df3$count_n)
  qlink_mean <- mean(qlinks)
  qlink_sd <- sd(qlinks)

  df4$n[row] <- length(qlinks)
  df4$mean[row] <- qlink_mean
  df4$sd[row] <- qlink_sd
  df4$min[row] <- min(df3$qlinkAbs)
  df4$max[row] <- max(df3$qlinkAbs)
  # Share of records lying more than one standard deviation above the mean.
  df4$percent[row] <-
    sprintf(
      '%.2f%%',
      length(qlinks[qlinks > qlink_mean + qlink_sd]) * 100 / length(qlinks)
    )
}
# mean -/+ sd, clamped to the observed [min, max] range.
df4$bottom <- pmax(df4$mean - df4$sd, df4$min)
df4$top <- pmin(df4$mean + df4$sd, df4$max)
df4

# Log-log scatter plot of the Qlink histogram, one facet per tag.
# Per facet: red line = mean, green line = mean + sd (clamped to max),
# blue line = max; the blue segment spans top..max and the blue label shows
# the share of records above mean + sd.
# The annotation layers take df4 directly: facet_wrap() assigns each row to
# its panel through the shared `tag` column, so the former
# filter(df4, tag==tag) (a column compared with itself) was a no-op.
p <- ggplot(df2) +
  xlab('Connectedness (Qlink)') +
  ylab('number of records') +
  ggtitle('Comparision of MARC tags how strongly they connect records') +
  facet_wrap(~ tag, ncol = 3) +
  geom_point(aes(x = qlinkAbs, y = count_n), size = 0.01) +
  scale_x_log10() +
  scale_y_log10() +
  geom_vline(data = df4, aes(xintercept = mean), colour = "red") +
  geom_vline(data = df4, aes(xintercept = top), colour = "green") +
  geom_vline(data = df4, aes(xintercept = max), colour = "blue") +
  geom_text(
    data = df4,
    aes(x = mean * 0.9, y = max_count * 0.8, label = sprintf('%.4f', mean)),
    hjust = 'right',
    colour = 'red'
  ) +
  geom_text(
    data = df4,
    aes(x = max * 0.9, y = max_count * 0.8, label = percent),
    hjust = 'right',
    colour = 'blue'
  ) +
  geom_segment(
    data = df4,
    aes(x = top, xend = max, y = max_count * 0.1, yend = max_count * 0.1),
    colour = 'blue'
  )
p

# Read the per-record degrees (columns used: tag, degree).
prefix <- 'network-scores-degrees'
csv <- sprintf("%s/%s.csv", output_dir, prefix)
if (!file.exists(csv)) {
  stop(paste("input file", csv, "does not exist!"))
}
df_degree <- read_csv(csv)

# Hirsch index of a vector of degrees: the largest h such that at least h
# entries are >= h. With the degrees sorted in decreasing order the condition
# degree >= rank holds exactly for the first h ranks, so counting them gives h.
# NA degrees are dropped by sort().
hirsch_index <- function(degrees) {
  ranked <- sort(degrees, decreasing = TRUE)
  sum(ranked >= seq_along(ranked))
}

# One row per tag; the h-index is stored in `degree` to keep the column layout
# of hirsch-index.csv. The previous pipeline took the degree found at the first
# rank with rank >= degree, which under-reports h when the degrees have a gap
# (10, 9, 1 gave 1 instead of 2) and dropped the tag when no rank qualified.
# summarise() returns an ungrouped tibble, so converting `tag` to a factor no
# longer happens on a grouping column.
df_hirsch <- df_degree %>%
  group_by(tag) %>%
  summarise(degree = hirsch_index(degree)) %>%
  arrange(desc(degree)) %>%
  mutate(tag = factor(tag))

# Persist the h-index table next to the other outputs.
write_csv(df_hirsch, sprintf('%s/hirsch-index.csv', output_dir))

# Horizontal bar chart of the h-index per tag; fct_rev() flips the factor
# order so the first level is drawn at the top of the y axis.
img_hirsch <- ggplot(df_hirsch, aes(x = degree, y = fct_rev(tag))) +
  geom_col() +
  labs(
    title = "Hirsch-index of MARC tags",
    subtitle = 'There are at least x number of records which has x links',
    x = 'h-index',
    y = 'MARC tag'
  ) +
  theme_bw()

# Write the chart as a 5x5 inch PNG into the output directory.
img_path <- file.path(output_dir, "hirsch.png")
ggsave(
  filename = img_path,
  plot = img_hirsch,
  device = "png",
  width = 5,
  height = 5
)
print(paste('creating', img_path))

# The 20 tags with the largest summed degree
# (add filter(tag != 'all') before grouping to leave out the aggregate row).
degree_totals <- summarise(group_by(df_degree, tag), total = sum(degree))
tags <- degree_totals %>%
  arrange(desc(total)) %>%
  slice_max(total, n = 20) %>%
  pull(tag)

# Box plot of the degree distribution per selected tag, log-scaled y axis.
p_degree <- ggplot(filter(df_degree, tag %in% tags), aes(tag, degree)) +
  geom_boxplot() +
  scale_y_log10() +
  labs(
    y = 'Degree',
    title = 'Comparision of MARC tags how strongly they connect records'
  )

# Read the PageRank scores (columns used: tag, score).
prefix <- 'network-scores-pagerank'
csv <- file.path(output_dir, paste0(prefix, ".csv"))
if (!file.exists(csv)) {
  stop("input file ", csv, " does not exist!")
}
df_pr <- read_csv(csv)

# Box plot of the PageRank score per selected tag, log-scaled y axis.
# No title: this panel is stacked below p_degree in the combined figure.
p_pr <- ggplot(filter(df_pr, tag %in% tags), aes(tag, score)) +
  geom_boxplot() +
  scale_y_log10() +
  labs(y = 'PageRank')

# Read the connected components (columns used: tag, size) from the output
# directory, with the same existence check as the other inputs. The path used
# to be a hard-coded absolute path that ignored output_dir.
prefix <- 'network-scores-components'
csv <- sprintf("%s/%s.csv", output_dir, prefix)
if (!file.exists(csv)) {
  stop(paste("input file", csv, "does not exist!"))
}
df_components <- read_csv(csv)
df_components

# Size distribution of the components of the 'all' network:
# one row per component size with the number of components of that size.
df_xy <- df_components %>%
  filter(tag == 'all') %>%
  count(size, name = "count")

df_xy

# Exploratory linear fit of count ~ size for components smaller than 1000.
df_l <- df_xy %>%
  transmute(x = size, y = count) %>%
  filter(x < 1000)

# NOTE: despite its name this is an ordinary linear model on the raw values.
exponential.model <- lm(y ~ x, df_l)
exponential.model
exponential.model$coefficients
exponential.model$residuals
exponential.model$effects
exponential.model$rank
exponential.model$fitted.values
exponential.model$assign

# A dangling `exponential.model$` used to precede this statement, which made
# R parse it as `exponential.model$x <- predict(...)` and left `x` undefined.
x <- predict(exponential.model, interval = 'prediction')
df_l$fit <- as.vector(x[, 'fit'])

# fit2 holds the model's fitted values (the same numbers as `fit`); an earlier
# x^-3.58 assignment was overwritten immediately and has been removed.
df_l$fit2 <- as.vector(exponential.model$fitted.values)

# View() needs an interactive session; skip it when run through Rscript.
if (interactive()) {
  View(df_l)
}

# Observed points (black) with the fitted values (red / orange) and a smoother
# on log-log axes.
df_l %>%
  ggplot(aes(x, y)) +
  geom_point() +
  geom_point(aes(x, fit), color = 'red') +
  geom_point(aes(x, fit2), color = 'orange') +
  stat_smooth(formula = y ~ x) +
  scale_x_log10() +
  scale_y_log10()
  
# Component size distribution on log-log axes.
ggplot(df_xy) +
  geom_point(aes(x = size, y = count)) +
  scale_x_log10() +
  scale_y_log10()

# The same distribution on linear axes, restricted to sizes below 2000.
df_xy %>%
  filter(size < 2000) %>%
  ggplot() +
  geom_point(aes(x = size, y = count))

# Straight-line fit in log-log space: log(count) = a + b * log(size).
# NOTE(review): the original formula referenced columns x / y (and later
# Time / Counts) that do not exist in df_xy, and summarised an undefined
# `model`; modelling count as a function of size is the assumed intent --
# confirm.
exponential.model <- lm(log(count) ~ log(size), df_xy)
summary(exponential.model)

df_xy

# Fitted counts for the observed sizes, back-transformed from the log scale.
x <- exp(predict(exponential.model, list(size = df_xy$size)))
unlist(x, use.names = FALSE)

# Fitted curve on a dense grid of sizes.
timevalues <- seq(1, 3000, 0.1)
Counts.exponential2 <- exp(predict(exponential.model, list(size = timevalues)))
Counts.exponential2

ggplot(df_xy) +
  geom_point(aes(x = size, y = count))

# Base-graphics view of the data with the fitted curve on top.
plot(df_xy$size, df_xy$count, pch = 16,
     xlab = "Component size", ylab = "Number of components")
lines(timevalues, Counts.exponential2, lwd = 2, col = "red")

# Small worked example (Time, Counts) used to try out an exponential fit.
# The non-consecutive row names are kept as they were.
A <- structure(
  list(
    Time = c(0, 1, 2, 4, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21,
             22, 24, 25, 26, 27, 28, 29, 30),
    Counts = c(126.6, 101.8, 71.6, 101.6, 68.1, 62.9, 45.5, 41.9, 46.3, 34.1,
               38.2, 41.7, 24.7, 41.5, 36.6, 19.6, 22.8, 29.6, 23.5, 15.3,
               13.4, 26.8, 9.8, 18.8, 25.9, 19.3)
  ),
  .Names = c("Time", "Counts"),
  row.names = c(1L, 2L, 3L, 5L, 7L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L,
                19L, 20L, 21L, 22L, 23L, 25L, 26L, 27L, 28L, 29L, 30L, 31L),
  class = "data.frame")

names(A)

# Exponential decay fitted as a linear model on log(Counts). The columns are
# taken from A through `data =` / `A$...` instead of attach(A), which was never
# detached and left Time / Counts on the search path for the rest of the
# script.
exponential.model <- lm(log(Counts) ~ Time, data = A)
summary(exponential.model)

ggplot(data = A) +
  geom_point(aes(x = Time, y = Counts))

# Observed points with the back-transformed fitted curve.
timevalues <- seq(1, 3000, 0.1)
Counts.exponential2 <- exp(predict(exponential.model, list(Time = timevalues)))
plot(A$Time, A$Counts, pch = 16, xlab = "Time (s)", ylab = "Counts")
lines(timevalues, Counts.exponential2, lwd = 2, col = "red")



# Per-tag statistics of the component sizes for (at most) the first 12 tags.
# head() is used instead of tags[1:12] so that a shorter `tags` vector is not
# padded with NA entries.
top_tags <- head(tags, 12)
df4 <- data.frame(tag = top_tags)
for (tag_name in top_tags) {
  df3 <- df_components %>% filter(tag == tag_name)
  row <- df4$tag == tag_name
  sizes <- df3$size

  df4$n[row] <- length(sizes)   # number of components
  df4$sum[row] <- sum(sizes)    # summed component sizes
  df4$mean[row] <- mean(sizes)
  df4$sd[row] <- sd(sizes)
  df4$min[row] <- min(sizes)
  df4$max[row] <- max(sizes)
}
# mean -/+ sd, clamped to the observed [min, max] range.
df4$bottom <- pmax(df4$mean - df4$sd, df4$min)
df4$top <- pmin(df4$mean + df4$sd, df4$max)

# Number of components per (tag, size) for the selected tags.
cluster_counts <- df_components %>%
  filter(tag %in% top_tags) %>%
  count(tag, size, name = "count")

# Label height is derived from the data that is actually plotted. It used to
# come from `max_count`, which was computed from the Qlink histogram.
# NOTE(review): assumed to be a copy-paste leftover -- confirm.
max_cluster_count <- max(cluster_counts$count)

# Log-log scatter plot of the component size distribution, one facet per tag,
# with grey markers and labels for the mean and the maximum size. The max
# label also shows the largest component's share of the summed sizes.
# The annotation layers take df4 directly: facet_wrap() matches rows to panels
# through `tag`, so the former filter(df4, tag==tag) was a no-op.
plot_clusters <- ggplot(cluster_counts) +
  xlab('cluster size') +
  ylab('number of clusters') +
  ggtitle('Comparision of MARC tags how strongly they connect records') +
  facet_wrap(~ tag, ncol = 4) +
  geom_point(aes(x = size, y = count), size = 0.01) +
  scale_x_log10(
    breaks = c(10^2, 10^4, 10^6),
    labels = c(expression(10^2),
               expression(10^4),
               expression(10^6))
  ) +
  scale_y_log10(
    breaks = c(10^1, 10^3, 10^5),
    labels = c(expression(10^1),
               expression(10^3),
               expression(10^5))
  ) +
  geom_vline(data = df4, aes(xintercept = mean), colour = '#666666') +
  geom_vline(data = df4, aes(xintercept = max), colour = '#666666') +
  geom_text(
    data = df4,
    aes(x = mean * 0.9, y = max_cluster_count * 0.8,
        label = sprintf('mean\n%.1f', mean)),
    hjust = 'right', vjust = 'top',
    colour = '#666666') +
  geom_text(
    data = df4,
    aes(x = max * 0.9, y = max_cluster_count * 0.8,
        label = sprintf('max: %d\n%.1f%%', max, max * 100 / sum)),
    hjust = 'right', vjust = 'top',
    colour = '#666666')

# Write the chart as a 10x5 inch PNG into the output directory.
img_path <- sprintf("%s/%s.png", output_dir, 'clusters')
ggsave(plot_clusters, device = "png", filename = img_path, width = 10, height = 5)
print(paste('creating', img_path))

#  geom_text(
#    data=filter(df4, tag==tag),
#    aes(x = mean * 0.9, y = max_count * 0.8, label = sprintf('%.4f', mean), hjust='right'),
#    colour = 'red') + 
#  geom_text(data=filter(df4, tag==tag),
#            aes(
#              x = max * 0.9, y = max_count * 0.8,
#              label = percent,
#              hjust = 'right'), colour = 'blue') +
#  geom_segment(data=filter(df4, tag==tag),
#               aes(x = top, xend = max, 
#                   y = max_count * 0.1, yend = max_count * 0.1),
#               colour = 'blue')

# Box plot of the component sizes per selected tag, log-scaled y axis.
# No title: this is the bottom panel of the combined figure.
p_components <- ggplot(filter(df_components, tag %in% tags), aes(tag, size)) +
  geom_boxplot() +
  scale_y_log10() +
  labs(x = 'MARC21 tags', y = 'Connected components')

print(p_degree)

# Stack the degree, PageRank and component panels in a single column.
all_network <- grid.arrange(p_degree, p_pr, p_components, ncol = 1)

# Write the combined figure as a 10x5 inch PNG into the output directory.
img_path <- file.path(output_dir, "network.png")
ggsave(
  filename = img_path,
  plot = all_network,
  device = "png",
  width = 10,
  height = 5
)
print(paste('creating', img_path))
