--- miscJava/SpejdernetScraper/src/dk/thoerup/spejdernetscraper/IMDBSeriesScraper.java 2014/02/11 20:42:35 2119 +++ miscJava/SpejdernetScraper/src/main/java/dk/thoerup/spejdernetscraper/IMDBSeriesScraper.java 2018/05/16 12:09:23 3234 @@ -1,21 +1,48 @@ package dk.thoerup.spejdernetscraper; +import java.util.concurrent.Callable; +import java.util.concurrent.TimeUnit; + import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; +import com.google.common.cache.Cache; +import com.google.common.cache.CacheBuilder; + +import dk.thoerup.genericjavautils.HttpUtil; + public class IMDBSeriesScraper { + private static Cache<String, String> webCache = CacheBuilder.newBuilder() + .expireAfterWrite(5, TimeUnit.MINUTES) + .maximumSize(1000) + .build(); + + private String getDoc(final String url) throws Exception { + + return webCache.get(url, new Callable<String>() { + + @Override + public String call() throws Exception { + //TODO: implement something that can follow redirects + return HttpUtil.getContentString(url, 10000); + } + }); + + } + public String fechTitle(String sid, int season, int episode) throws Exception { - final String epMatch = "Ep" + episode; + final String epMatch = "ep" + episode; final String noTitle = "Episode #" + season + "." 
+ episode; - String url = "http://www.imdb.com/title/" + sid + "/episodes?season=" + season; + String url = "https://www.imdb.com/title/" + sid + "/episodes?season=" + season; + + String rawHtml = getDoc(url); - Document page = Jsoup.connect(url) - .get(); + Document page = Jsoup.parse(rawHtml); Element episodesContent = page.getElementById("episodes_content"); @@ -29,9 +56,10 @@ Element anchor = image.child(0); String title = anchor.attr("title"); - String content = anchor.text().trim(); + //String content = anchor.text().trim(); + String href = anchor.attr("href"); - if ( content.endsWith(epMatch)) { + if ( href.endsWith(epMatch)) { if (title.equalsIgnoreCase(noTitle)) { return "!Title not found!"; }