/[projects]/miscJava/SpejdernetScraper/src/main/java/dk/thoerup/spejdernetscraper/IMDBSeriesScraper.java
ViewVC logotype

Contents of /miscJava/SpejdernetScraper/src/main/java/dk/thoerup/spejdernetscraper/IMDBSeriesScraper.java

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3231 - (show annotations) (download)
Wed May 16 11:39:19 2018 UTC (6 years ago) by torben
File size: 1914 byte(s)
Cache html page from source
1 package dk.thoerup.spejdernetscraper;
2
3 import java.util.concurrent.Callable;
4 import java.util.concurrent.TimeUnit;
5
6 import org.jsoup.Jsoup;
7 import org.jsoup.nodes.Document;
8 import org.jsoup.nodes.Element;
9 import org.jsoup.select.Elements;
10
11 import com.google.common.cache.Cache;
12 import com.google.common.cache.CacheBuilder;
13
14 import dk.thoerup.genericjavautils.HttpUtil;
15
16 public class IMDBSeriesScraper {
17
18 private static Cache<String,String> webCache = CacheBuilder.newBuilder()
19 .expireAfterWrite(5, TimeUnit.MINUTES)
20 .maximumSize(1000)
21 .build();
22
23 private String getDoc(final String url) throws Exception {
24
25 return webCache.get(url, new Callable<String>() {
26
27 @Override
28 public String call() throws Exception {
29 return HttpUtil.getContentString(url, 10000);
30 }
31 });
32
33 }
34
35 public String fechTitle(String sid, int season, int episode) throws Exception {
36
37 final String epMatch = "ep" + episode;
38 final String noTitle = "Episode #" + season + "." + episode;
39
40 String url = "http://www.imdb.com/title/" + sid + "/episodes?season=" + season;
41
42 String rawHtml = getDoc(url);
43
44 Document page = Jsoup.parse(rawHtml);
45
46 Element episodesContent = page.getElementById("episodes_content");
47
48 Elements episodesList = episodesContent.getElementsByClass("list_item");
49
50 for (int i=0; i<episodesList.size(); i++) {
51 Element curEp = episodesList.get(i);
52
53 Element image = curEp.getElementsByClass("image").first();
54
55 Element anchor = image.child(0);
56
57 String title = anchor.attr("title");
58 //String content = anchor.text().trim();
59 String href = anchor.attr("href");
60
61 if ( href.endsWith(epMatch)) {
62 if (title.equalsIgnoreCase(noTitle)) {
63 return "!Title not found!";
64 }
65
66 return title;
67 }
68 }
69
70
71 throw new Exception("Episode not found !");
72 }
73
74 }

  ViewVC Help
Powered by ViewVC 1.1.20