/[projects]/miscJava/SpejdernetScraper/src/main/java/dk/thoerup/spejdernetscraper/IMDBSeriesScraper.java
ViewVC logotype

Contents of /miscJava/SpejdernetScraper/src/main/java/dk/thoerup/spejdernetscraper/IMDBSeriesScraper.java

Parent Directory | Revision Log


Revision 3234 - (show annotations) (download)
Wed May 16 12:09:23 2018 UTC (6 years ago) by torben
File size: 1975 byte(s)
access imdb via HTTPS
1 package dk.thoerup.spejdernetscraper;
2
3 import java.util.concurrent.Callable;
4 import java.util.concurrent.TimeUnit;
5
6 import org.jsoup.Jsoup;
7 import org.jsoup.nodes.Document;
8 import org.jsoup.nodes.Element;
9 import org.jsoup.select.Elements;
10
11 import com.google.common.cache.Cache;
12 import com.google.common.cache.CacheBuilder;
13
14 import dk.thoerup.genericjavautils.HttpUtil;
15
16 public class IMDBSeriesScraper {
17
18 private static Cache<String,String> webCache = CacheBuilder.newBuilder()
19 .expireAfterWrite(5, TimeUnit.MINUTES)
20 .maximumSize(1000)
21 .build();
22
23 private String getDoc(final String url) throws Exception {
24
25 return webCache.get(url, new Callable<String>() {
26
27 @Override
28 public String call() throws Exception {
29 //TODO: implement something that can follow redirects
30 return HttpUtil.getContentString(url, 10000);
31 }
32 });
33
34 }
35
36 public String fechTitle(String sid, int season, int episode) throws Exception {
37
38 final String epMatch = "ep" + episode;
39 final String noTitle = "Episode #" + season + "." + episode;
40
41 String url = "https://www.imdb.com/title/" + sid + "/episodes?season=" + season;
42
43 String rawHtml = getDoc(url);
44
45 Document page = Jsoup.parse(rawHtml);
46
47 Element episodesContent = page.getElementById("episodes_content");
48
49 Elements episodesList = episodesContent.getElementsByClass("list_item");
50
51 for (int i=0; i<episodesList.size(); i++) {
52 Element curEp = episodesList.get(i);
53
54 Element image = curEp.getElementsByClass("image").first();
55
56 Element anchor = image.child(0);
57
58 String title = anchor.attr("title");
59 //String content = anchor.text().trim();
60 String href = anchor.attr("href");
61
62 if ( href.endsWith(epMatch)) {
63 if (title.equalsIgnoreCase(noTitle)) {
64 return "!Title not found!";
65 }
66
67 return title;
68 }
69 }
70
71
72 throw new Exception("Episode not found !");
73 }
74
75 }

  ViewVC Help
Powered by ViewVC 1.1.20