1 |
torben |
2067 |
package dk.thoerup.spejdernetscraper;
|
2 |
|
|
|
3 |
torben |
3231 |
import java.util.concurrent.Callable;
|
4 |
|
|
import java.util.concurrent.TimeUnit;
|
5 |
|
|
|
6 |
torben |
2067 |
import org.jsoup.Jsoup;
|
7 |
|
|
import org.jsoup.nodes.Document;
|
8 |
|
|
import org.jsoup.nodes.Element;
|
9 |
|
|
import org.jsoup.select.Elements;
|
10 |
|
|
|
11 |
torben |
3231 |
import com.google.common.cache.Cache;
|
12 |
|
|
import com.google.common.cache.CacheBuilder;
|
13 |
|
|
|
14 |
|
|
import dk.thoerup.genericjavautils.HttpUtil;
|
15 |
|
|
|
16 |
torben |
2067 |
public class IMDBSeriesScraper {
|
17 |
|
|
|
18 |
torben |
3231 |
private static Cache<String,String> webCache = CacheBuilder.newBuilder()
|
19 |
|
|
.expireAfterWrite(5, TimeUnit.MINUTES)
|
20 |
|
|
.maximumSize(1000)
|
21 |
|
|
.build();
|
22 |
|
|
|
23 |
|
|
private String getDoc(final String url) throws Exception {
|
24 |
|
|
|
25 |
|
|
return webCache.get(url, new Callable<String>() {
|
26 |
|
|
|
27 |
|
|
@Override
|
28 |
|
|
public String call() throws Exception {
|
29 |
|
|
return HttpUtil.getContentString(url, 10000);
|
30 |
|
|
}
|
31 |
|
|
});
|
32 |
|
|
|
33 |
|
|
}
|
34 |
|
|
|
35 |
torben |
2067 |
public String fechTitle(String sid, int season, int episode) throws Exception {
|
36 |
|
|
|
37 |
torben |
3230 |
final String epMatch = "ep" + episode;
|
38 |
torben |
2068 |
final String noTitle = "Episode #" + season + "." + episode;
|
39 |
torben |
2067 |
|
40 |
|
|
String url = "http://www.imdb.com/title/" + sid + "/episodes?season=" + season;
|
41 |
|
|
|
42 |
torben |
3231 |
String rawHtml = getDoc(url);
|
43 |
torben |
2067 |
|
44 |
torben |
3231 |
Document page = Jsoup.parse(rawHtml);
|
45 |
|
|
|
46 |
torben |
2067 |
Element episodesContent = page.getElementById("episodes_content");
|
47 |
|
|
|
48 |
|
|
Elements episodesList = episodesContent.getElementsByClass("list_item");
|
49 |
|
|
|
50 |
|
|
for (int i=0; i<episodesList.size(); i++) {
|
51 |
|
|
Element curEp = episodesList.get(i);
|
52 |
|
|
|
53 |
|
|
Element image = curEp.getElementsByClass("image").first();
|
54 |
|
|
|
55 |
|
|
Element anchor = image.child(0);
|
56 |
|
|
|
57 |
|
|
String title = anchor.attr("title");
|
58 |
torben |
3230 |
//String content = anchor.text().trim();
|
59 |
|
|
String href = anchor.attr("href");
|
60 |
torben |
2067 |
|
61 |
torben |
3230 |
if ( href.endsWith(epMatch)) {
|
62 |
torben |
2119 |
if (title.equalsIgnoreCase(noTitle)) {
|
63 |
|
|
return "!Title not found!";
|
64 |
|
|
}
|
65 |
torben |
2068 |
|
66 |
torben |
2067 |
return title;
|
67 |
|
|
}
|
68 |
|
|
}
|
69 |
|
|
|
70 |
|
|
|
71 |
|
|
throw new Exception("Episode not found !");
|
72 |
|
|
}
|
73 |
|
|
|
74 |
|
|
}
|