Commit 55336ae5 authored by Erik Hetzner's avatar Erik Hetzner

add c4l-2012 talk

parent 20441f4d
% $Header: /cvsroot/latex-beamer/latex-beamer/solutions/generic-talks/generic-ornate-15min-45min.en.tex,v 1.5 2007/01/28 20:48:23 tantau Exp $
\documentclass[14pt]{beamer}
\usepackage{listings}
\usepackage{graphicx}
% Scala language definition for the listings package, used as the default
% language for code listings in this deck.
\lstdefinelanguage{scala}{
% Words treated as keywords.
morekeywords={abstract,case,catch,class,def,%
do,else,extends,false,final,finally,%
for,if,implicit,import,match,mixin,%
new,null,object,override,package,%
private,protected,requires,return,sealed,%
super,this,throw,trait,true,try,%
type,val,var,while,with,yield},
% Symbolic tokens that should also be treated as keywords.
otherkeywords={=>,<-,<\%,<:,>:,\#,@},
% Keywords are matched case-sensitively.
sensitive=true,
% [l]: line comment introduced by //.
morecomment=[l]{//},
% [n]: delimited comment /* ... */ that may be nested.
morecomment=[n]{/*}{*/},
% [b]: string delimiters; a backslash escapes the delimiter inside the string.
morestring=[b]",
morestring=[b]',
% NOTE(review): presumably meant for triple-quoted strings; confirm that
% listings accepts a multi-character string delimiter here.
morestring=[b]"""
}
% `output' language: declared empty, i.e. no keywords, comments or strings.
\lstdefinelanguage{output}{}
% `full' style: frame on all four sides (t=top, r=right, b=bottom, l=left).
\lstdefinestyle{full}{
frame=trbl,
}
% `output' style: 1pt red rule along the top and left edges, combined with
% the empty `output' language declared above.
\lstdefinestyle{output}{
frame=tl,
framerule=1pt,
rulecolor=\color{red},
language=output
}
\usepackage{color}
\definecolor{dkgreen}{rgb}{0,0.6,0}
\definecolor{gray}{rgb}{0.5,0.5,0.5}
\definecolor{mauve}{rgb}{0.58,0,0.82}
% Default settings for code listings
\lstset{
% No frame by default; the `full' and `output' styles add one where wanted.
frame=none,
language=scala,
% Vertical space above and below each listing.
aboveskip=3mm,
belowskip=3mm,
showstringspaces=false,
columns=flexible,
basicstyle={\small\ttfamily},
% Line numbers are off; numberstyle only matters if a listing enables them.
numbers=none,
numberstyle=\tiny\color{gray},
% Colours defined with \definecolor just above this block.
keywordstyle=\color{blue},
commentstyle=\color{dkgreen},
stringstyle=\color{mauve},
% Wrap long lines, but only at whitespace.
breaklines=true,
% The comma after this key was missing, so `tabsize=3' was read as part of
% the value of breakatwhitespace and the tab size was never set.
breakatwhitespace=true,
tabsize=3
}
\lstset{language=scala}
\mode<presentation>
{
% \usetheme{Boadilla}
% \usetheme{Warsaw}
\usetheme{AnnArbor}
\setbeamercovered{transparent}
% or whatever (possibly just delete it)
}
\usepackage[english]{babel}
\usepackage[utf8]{inputenc}
%\usepackage{times}
\usepackage[T1]{fontenc}
\usepackage[scaled]{berasans}
\renewcommand*\familydefault{\sfdefault}
\title[Indexing big data]{Indexing big data with Tika, Solr, and map-reduce}
\author{Scott Fisher, Erik Hetzner}
\institute[CDL]{California Digital Library}
\date{8 February 2012}
\subject{Talks}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
%\begin{frame}{Outline}
% \tableofcontents
% % You might wish to add the option [pausesections]
%\end{frame}
% Opening section (outline and Web Archiving Service background frames follow).
\section{Introduction}
\begin{frame}{Outline}
\begin{itemize}
\item Introduction
\item Tika
\item Pig
\item Solr
\item Done!
\end{itemize}
\end{frame}
\begin{frame}{Web Archiving Service}
\begin{itemize}
\item Service provided by the California Digital Library
\item Fee-based
\item Archiving web sites,
\item as selected by curators
\end{itemize}
\end{frame}
\begin{frame}{Vital statistics}
\begin{itemize}[<+->]
\item 43 public archives
\item 18 partners
\item 58k crawls, 35k viewable by public
\item 7535 sites
\item 600 million URLs
\item 40+ TB
\end{itemize}
\end{frame}
\begin{frame}{Tools}
\begin{center}
Open source and rails UI for crawl management and display of many focused web crawls. \par
Heritrix - NutchWAX - Wayback\par
\includegraphics[width=4.5in]{was_gradient.jpg}
\end{center}
\end{frame}
\begin{frame}{Nutch search}
\begin{itemize}[<+->]
\item Using Nutch for full text indexing
\item Nutch is slowing down\dots
\item NutchWAX (Nutch + web archiving) is no longer supported
\item Nutch search is no longer default with Nutch itself
\item Deduplicating content requires a more sophisticated index.
\end{itemize}
\end{frame}
\begin{section}{Tika}
\begin{frame}{Parsing}
\begin{itemize}[<+->]
\item The web can contain anything.
\item Mostly HTML, but PDFs are very important.
\item Not to mention Office
\end{itemize}
\end{frame}
\begin{frame}{Tika}
\begin{itemize}[<+->]
\item Apache software project
\item Java
\item Wraps parsers for different file types in a uniform interface.
\item Parses most common file types.
\item Use the same code to parse different types.
\end{itemize}
\end{frame}
\begin{frame}{Tika difficulties}
\begin{itemize}[<+->]
\item Some files are slow to parse.
\item Some files blow up your memory.
\item Some file parses never return.
\end{itemize}
\end{frame}
\begin{frame}{Tika solutions}
\begin{itemize}[<+->]
\item Don't parse files that are too big (e.g.\ $>$ 2\,MB)
\item Fork and monitor process from the outside (Hadoop comes in handy)
\end{itemize}
\end{frame}
\end{section}
\begin{section}{Pig}
\begin{frame}{What is Pig?}
\begin{itemize}[<+->]
\item Platform for data analysis from Apache.
\item Based on Hadoop.
\begin{itemize}
\item fault tolerant
\item distributed processing
\end{itemize}
\item Can be used for ad-hoc analysis, without writing Java code.
\item Embraced by the Internet Archive.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Pig example}
\begin{lstlisting}
Data = LOAD 'arclist' USING
org.cdlib.was.weari.pig.ArchiveURLParserLoader();
STORE Data INTO 'outputdir.json' USING
org.cdlib.was.weari.pig.JsonParsedArchiveRecordStorer();
\end{lstlisting}
\end{frame}
\end{section}
\begin{section}{Lessons learned}
\begin{frame}{Parse once}
\begin{itemize}[<+->]
\item Parse once!
\item Parsing takes forever. Do it once, store the results.
\item Storing raw text is cheap, compared to all those PDFs, HTML, etc.
\end{itemize}
\end{frame}
\begin{frame}{Distribute from the start}
\begin{itemize}[<+->]
\item Use Hadoop, Pig, or another system to distribute your computing.
\item Don't use an ad-hoc solution. Take the time up front to distribute things.
\end{itemize}
\end{frame}
\end{section}
\begin{section}{Solr}
\begin{frame}{Faceting}
\begin{center}
\includegraphics[width=4.5in]{faceting.jpg}
\end{center}
\end{frame}
\begin{frame}{Faceting 2}
\begin{center}
\includegraphics[width=4.5in]{faceting2.jpg}
\end{center}
\end{frame}
\begin{frame}{Mime types}
\begin{center}
\includegraphics[width=4.5in]{mimetypes.png}
\end{center}
\end{frame}
\begin{frame}{Solr XML}
\begin{center}
\includegraphics[width=4.5in]{some_xml.jpg}
\end{center}
\end{frame}
\end{section}
% Closing slide: summary of recommendations, project URL and contact addresses.
\begin{frame}{Finale}
\begin{itemize}
\item Be careful when you try to parse a bunch of files you downloaded from the web.
\item Parse and store.
\item Distribute up front.
\item Build a test index first.
\end{itemize}
\vskip1em
\begin{center}
% \tt is a font switch that takes no argument, so \tt{...} only worked by
% accident. Use \url (provided by hyperref, which beamer loads) for the link
% and \texttt for the e-mail addresses.
\url{http://webarchives.cdlib.org/}\par
\vskip1em
\texttt{scott.fisher@ucop.edu, erik.hetzner@ucop.edu}
\end{center}
\end{frame}
\end{document}
\ No newline at end of file
\frametitle {Pig example}
\begin{lstlisting}
Data = LOAD 'arclist' USING
org.cdlib.was.weari.pig.ArchiveURLParserLoader();
STORE Data INTO 'outputdir.json' USING
org.cdlib.was.weari.pig.JsonParsedArchiveRecordStorer();
\end{lstlisting}
This source diff could not be displayed because it is too large. You can view the blob instead.
\documentclass{beamer}
\usepackage{listings}
\usepackage{graphicx}
\usepackage{color}
\definecolor{dkgreen}{rgb}{0,0.6,0}
\definecolor{gray}{rgb}{0.5,0.5,0.5}
\definecolor{mauve}{rgb}{0.58,0,0.82}
\mode<presentation>
{
\usetheme{AnnArbor}
\setbeamercovered{transparent}
}
\usepackage[english]{babel}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage[scaled]{berasans}
\renewcommand*\familydefault{\sfdefault}
\title[Hack the vote]{A strategy for code4lib voting, and some suggestions for improvement}
\author{Erik Hetzner}
\institute[CDL]{California Digital Library}
\date{9 Feb 2012}
\subject{Talks}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
\begin{frame}{Multiple winner election}
\begin{itemize}[<+->]
\item Voting for c4l talks is like electing a parliament.
\item `Majoritarian': top-rated talks are chosen
\item with no representation for small parties.
\item Each voter is given unlimited votes, and can assign 0--3 of them to each talk.
\end{itemize}
\end{frame}
\begin{frame}{Makes no sense}
\begin{center}
This makes no sense.
\end{center}
\end{frame}
%% * Here are 2 systems that make sense
\begin{frame}{Plurality-at-large}
\begin{itemize}[<+->]
\item Electing $n$ winners.
\item Each voter gets $n$ votes to divide among candidates, 1 per candidate.
\item Highest votes win.
\end{itemize}
\end{frame}
\begin{frame}{Cumulative voting}
\begin{itemize}[<+->]
\item Electing $n$ winners.
\item Each voter gets $n$ votes.
\item Can assign more than one vote to each candidate.
\end{itemize}
\end{frame}
\begin{frame}{Cumulative voting example}
\begin{center}
\includegraphics[width=1.5in]{cumulative.png}
\par
{\small (from Wikipedia)}
\end{center}
\end{frame}
\begin{frame}{c4l voting}
\begin{itemize}[<+->]
\item c4l voting allows 0--3 points for each talk
\item no limit
\end{itemize}
\end{frame}
\begin{frame}{Hacking}
\begin{itemize}[<+->]
\item Assume that as a voter you can provide a ranking of all talks
\item $A < B < C$
\item As a voter, all you care about is seeing the talks you want to see.
\end{itemize}
\end{frame}
\begin{frame}{Strategy}
\begin{itemize}[<+->]
\item Choose your top 22 talks
\item Give them each 3 votes
\item Every other talk gets nothing
\end{itemize}
\end{frame}
% \begin{frame}{Why?}
% \begin{center}
% Assume 10 talks can be elected.\par
% If you give your 10th (A) ranked talk 3 points \par
% and your 11th (B) 2 point.\par\par
% If A had 1 point before you voted, \par
% and B had 3 points, B would win. \par
% But if you gave B 0 points, A would win.
% \end{center}
% \end{frame}
\begin{frame}{What now?}
\begin{center}
If everybody follows this strategy,\par
c4l voting is reduced to plurality-at-large,\par
because everybody will do this.
\end{center}
\end{frame}
\begin{frame}{Fix}
\begin{itemize}
\item Limit points users can assign (to number of candidates)
\item and/or only allow users to give one vote (point) to each
talk
\item Or adopt a proportional representation system.
\end{itemize}
\end{frame}
% Closing slide with pointers for further reading.
\begin{frame}{Further reading}
\begin{center}
Szpiro, George. \emph{Numbers Rule: The Vexing Mathematics of
Democracy}. Princeton, 2010.\par\vskip1em
% \tt is a font switch that takes no argument; \url (provided by hyperref,
% which beamer loads) typesets the address in typewriter type and handles
% the underscore without escaping.
\url{http://en.wikipedia.org/wiki/Voting_systems}
\end{center}
\end{frame}
\end{document}
\ No newline at end of file
* Outline
- Can't describe everything for you. Concentrate on a few issues.
- Short & sweet. Leave room for questions.
- How many people know about web archiving?
- I'm going to concentrate on Tika & Pig.
- SF will concentrate on the features that solr is giving us.
- Neither of us will really talk about solr tuning, etc.
- we haven't done it yet
- Maybe a little disjointed, bear with us.
- Hope that people will ask questions.
* Vital statistics
- archives, topical collections gathered by curators
- NYU, Stanford, UC campuses, UMich, USDA, etc.
* Nutch search
- Nutch, open source crawler & search engine
- unacceptable response times for some searches
* Parsing
* Tika
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment