Jump to content

User:DCausse/Completion Suggester And Pageviews

From Wikitech
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

Dump containing all the variables needed to compute the score on enwiki: stats_enwiki_with_pv.csv.gz

Small R script to experiment with the score

# Load the stats dump (already decompressed) into a data frame.
dat <- read.csv(
  "stats_enwiki_with_pv.csv",
  header = TRUE,
  sep = ","
)

# Popularity component of the score.
#
# maxDocs:  total number of documents; scales the log curve. Expected to be a
#           scalar (or the same length as popScore).
# popScore: page popularity. The original author's note suggests this is a
#           page-view ratio (pv / total_project_pv) -- confirm against the dump.
#           May be a single value or a whole column.
#
# Returns 1 for any popScore above POPULARITY_MAX, otherwise
# log(1 + popScore * maxDocs) taken in base (1 + POPULARITY_MAX * maxDocs),
# so 0 maps to 0 and the cap maps to 1.
#
# NOTE: the original author flagged this maxDocs-dependent log scale as
# "broken" and found working on the ratio difficult; treat it as experimental.
#
# Vectorised over popScore (no mapply() needed); NA inputs yield NA instead of
# aborting inside if().
pop_score <- function(maxDocs, popScore) {
  POPULARITY_MAX <- 0.0004

  over_cap <- popScore > POPULARITY_MAX

  # Clamp before taking the log so values above the cap never feed the curve;
  # their result is replaced by exactly 1 below, as in the scalar version.
  capped <- pmin(popScore, POPULARITY_MAX)
  scaled <- logb(1 + (capped * maxDocs), 1 + (POPULARITY_MAX * maxDocs))

  ifelse(over_cap, 1, scaled)
}

# Score function: weighted mean of the quality score (qual_score) and the
# popularity score (pop_score), multiplied by SCORE_RANGE.
popqual_score <- function(maxDocs, incomingLinksRaw, externalLinksRaw, bytesRaw, headingsRaw, redirectsRaw, tmplBoost, popScore) {
  SCORE_RANGE <- 10000000
  QSCORE_WEIGHT <- 1
  POPULARITY_WEIGHT <- 0.4

  quality <- qual_score(
    maxDocs, incomingLinksRaw, externalLinksRaw, bytesRaw,
    headingsRaw, redirectsRaw, tmplBoost
  )
  popularity <- pop_score(maxDocs, popScore)

  # Weighted mean of the two components, then stretched to the output range.
  weighted_sum <- quality * QSCORE_WEIGHT + popularity * POPULARITY_WEIGHT
  total_weight <- QSCORE_WEIGHT + POPULARITY_WEIGHT
  (weighted_sum / total_weight) * SCORE_RANGE
}

# Quality component of the score: weighted mean of five normalised page
# signals, then adjusted by the template boost.
#
# maxDocs:          total number of documents; sets the incoming-links cap.
# incomingLinksRaw: raw incoming-links count.
# externalLinksRaw: raw external-links count.
# bytesRaw:         raw page size.
# headingsRaw:      raw headings count.
# redirectsRaw:     raw redirects count.
# tmplBoost:        boost factor handed to boost().
#
# Each raw value is capped at its *_NORM constant and normalised by
# scoreNorm() (linear) or scoreNormL2() (log2). The weights below add up to
# 1.1, which is why the sum is divided by their total.
qual_score <- function(maxDocs, incomingLinksRaw, externalLinksRaw, bytesRaw, headingsRaw, redirectsRaw, tmplBoost) {
  INCOMING_LINKS_MAX_DOCS_FACTOR <- 1 / 10

  EXTERNAL_LINKS_NORM <- 20
  PAGE_SIZE_NORM <- 50000
  HEADING_NORM <- 20
  REDIRECT_NORM <- 30

  INCOMING_LINKS_WEIGHT <- 0.6
  EXTERNAL_LINKS_WEIGHT <- 0.1
  PAGE_SIZE_WEIGHT <- 0.1
  HEADING_WEIGHT <- 0.2
  REDIRECT_WEIGHT <- 0.1

  # Incoming links saturate once a page is linked by more than 1/10 of all
  # pages.
  incLinksNorm <- maxDocs * INCOMING_LINKS_MAX_DOCS_FACTOR

  incLinks <- scoreNormL2(incomingLinksRaw, incLinksNorm)
  pageSize <- scoreNormL2(bytesRaw, PAGE_SIZE_NORM)
  extLinks <- scoreNorm(externalLinksRaw, EXTERNAL_LINKS_NORM)
  headings <- scoreNorm(headingsRaw, HEADING_NORM)
  redirects <- scoreNorm(redirectsRaw, REDIRECT_NORM)

  score <- incLinks * INCOMING_LINKS_WEIGHT
  score <- score + extLinks * EXTERNAL_LINKS_WEIGHT
  score <- score + pageSize * PAGE_SIZE_WEIGHT
  score <- score + headings * HEADING_WEIGHT
  score <- score + redirects * REDIRECT_WEIGHT

  score <- score / (INCOMING_LINKS_WEIGHT + EXTERNAL_LINKS_WEIGHT + PAGE_SIZE_WEIGHT + HEADING_WEIGHT + REDIRECT_WEIGHT)

  # Disabled experiment: penalise pages whose size/headings or
  # external-links/size ratio is far from a target "balance".
  # NOTE(review): extLinksDistance below subtracts headingsBalance rather
  # than extLinksBalance -- fix that before re-enabling.
  #  OUTGOING_LINK_BLANCE = 2000;
  #  HEADINGS_BALANCE = 3000;
  #  EXT_LINKS_BALANCE = 2000;
  # headingsBalance <- bytesRaw / headingsRaw;
  # headingDistance <- HEADINGS_BALANCE - headingsBalance;
  # extLinksBalance <- externalLinksRaw / bytesRaw;
  # extLinksDistance <- EXT_LINKS_BALANCE - headingsBalance;
  # score <- score * (1-1/abs(headingDistance))
  # score <- score * (1-1/abs(extLinksDistance))

  boost(score, tmplBoost)
}

# Log-scale normalisation: log2(value / norm + 1) with value capped at norm,
# so 0 maps to 0 and anything at or above norm maps to 1.
#
# value: raw value(s); may be a vector.
# norm:  saturation point (scalar, or the same length as value).
#
# Vectorised via pmin(); NA inputs yield NA instead of aborting inside if().
scoreNormL2 <- function(value, norm) {
  log2((pmin(value, norm) / norm) + 1)
}

# Linear normalisation: value / norm with value capped at norm, so anything
# at or above norm maps to 1.
#
# value: raw value(s); may be a vector.
# norm:  saturation point (scalar, or the same length as value).
#
# Vectorised via pmin(); NA inputs yield NA instead of aborting inside if().
scoreNorm <- function(value, norm) {
  pmin(value, norm) / norm
}

# Apply a boost factor to a score.
#
# score: score(s) to adjust; may be a vector.
# boost: boost factor(s). Values above 1 push the score towards 1, values
#        below 1 pull it towards 0, and exactly 1 leaves it unchanged.
#
# The factor is first mapped to a signed strength:
#   boost >  1  ->  1 - 1 / boost   (positive)
#   boost <= 1  ->  -(1 - boost)    (zero or negative)
# A positive strength adds that fraction of half the gap between score and 1;
# a negative strength removes that fraction of half the score.
#
# Vectorised and recycled over both arguments; NA inputs yield NA instead of
# aborting inside if().
boost <- function(score, boost) {
  strength <- ifelse(boost > 1, 1 - (1 / boost), -(1 - boost))

  # Split the strength into its positive and negative parts so one arithmetic
  # expression covers both branches; the unused part contributes exactly 0.
  gain <- pmax(strength, 0)
  loss <- pmin(strength, 0)

  score + (((1 - score) / 2) * gain) + ((score / 2) * loss)
}
# compute the score
# Each scoring function is applied row by row with mapply(). nrow(dat) is
# passed as the first argument (maxDocs), i.e. the number of rows in the dump
# is used as the document count; being a single value it is recycled for
# every row.
# score:      combined quality + popularity score (popqual_score)
# score_qual: quality component on its own (qual_score)
# score_pop:  popularity component on its own (pop_score)
dat$score <- mapply(popqual_score, nrow(dat), dat$incomingLinks, dat$externalLinks, dat$bytes, dat$headings, dat$redirects, dat$tmplBoost, dat$pop_score );
dat$score_qual <- mapply(qual_score, nrow(dat), dat$incomingLinks, dat$externalLinks, dat$bytes, dat$headings, dat$redirects, dat$tmplBoost );
dat$score_pop <- mapply(pop_score, nrow(dat), dat$pop_score );


# Subsets of rows whose `page` value starts with a given prefix, kept for
# manual inspection of how the scores rank them.
li <- dat[grepl("^Li", dat$page),]
oba_osa <- dat[grepl("^(Barack|Osama)", dat$page),]
peg <- dat[grepl("^Peg", dat$page),]
Po <- dat[grepl("^Po", dat$page),]

# Rows whose quality score exceeds 1 -- presumably a sanity check, since the
# normalised components are each capped at 1 (TODO confirm intent).
bug <- dat[dat$score_qual > 1,]

# various stuff to explore distribution

#quantile(dat$redirects, 0.999);

# Five-number summaries (plus mean) of the two score components.
summary(dat$score_pop)
summary(dat$score_qual)

#plot(density(dat$incomingLinks, from=0.00001), log="x", xlim=c(1,max(dat$incomingLinks)))
#plot(density(dat$externalLinks, from=0.00001), log="x", xlim=c(1,max(dat$externalLinks)))
#plot(density(dat$bytes, from=0.00001), log="x", xlim=c(1,max(dat$bytes)))
#plot(density(dat$headings, from=0.00001), log="x", xlim=c(1,max(dat$headings)))
#plot(density(dat$redirects, from=0.00001), log="x", xlim=c(1,max(dat$redirects)))

#plot(density(dat$incomingLinks, from=0.00001), log="x", xlim=c(1,max(dat$incomingLinks)))

#mysamp <- dat[sample(1:nrow(dat), 100000, replace=FALSE),]

#plot(mysamp$incomingLinks+10, mysamp$score, log="x", cex=0.2)
#plot(mysamp$bytes+10, mysamp$score, log="x", cex=0.2)
#plot(mysamp$headings, mysamp$score, cex=0.2)
#plot(mysamp$redirects, mysamp$score, cex=0.2)
#plot(mysamp$externalLinks+10, mysamp$score,  cex=0.2)



#plot(density(dat$score, from=0.00001), log="x", xlim=c(1,max(dat$score)))


#plot(density(dat$headingBalance))
#plot(density(dat$headings))
#plot(dat$incomingLinks + 10,dat$externalLinks + 10, log="yx", cex=0.2)
#plot(dat$incomingLinks + 10, dat$bytes + 10, log="yx", cex=0.2)
#plot(dat$incomingLinks + 10, dat$headings + 10, log="yx", cex=0.2)
#plot(dat$incomingLinks + 10, dat$headings + 10, log="yx", cex=0.2)
#plot(dat$incomingLinks + 10, dat$redirects + 10, log="yx", cex=0.2)
#plot(dat$score_pop, dat$score, cex=0.2)
#plot(dat$score_qual, dat$score, cex=0.2)