package dk.thoerup.spejdernetscraper; import java.util.concurrent.Callable; import java.util.concurrent.TimeUnit; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import dk.thoerup.genericjavautils.HttpUtil; public class IMDBSeriesScraper { private static Cache webCache = CacheBuilder.newBuilder() .expireAfterWrite(5, TimeUnit.MINUTES) .maximumSize(1000) .build(); private String getDoc(final String url) throws Exception { return webCache.get(url, new Callable() { @Override public String call() throws Exception { return HttpUtil.getContentString(url, 10000); } }); } public String fechTitle(String sid, int season, int episode) throws Exception { final String epMatch = "ep" + episode; final String noTitle = "Episode #" + season + "." + episode; String url = "http://www.imdb.com/title/" + sid + "/episodes?season=" + season; String rawHtml = getDoc(url); Document page = Jsoup.parse(rawHtml); Element episodesContent = page.getElementById("episodes_content"); Elements episodesList = episodesContent.getElementsByClass("list_item"); for (int i=0; i