1 |
package dk.thoerup.spejdernetscraper; |
package dk.thoerup.spejdernetscraper; |
2 |
|
|
3 |
|
import java.util.concurrent.Callable; |
4 |
|
import java.util.concurrent.TimeUnit; |
5 |
|
|
6 |
import org.jsoup.Jsoup; |
import org.jsoup.Jsoup; |
7 |
import org.jsoup.nodes.Document; |
import org.jsoup.nodes.Document; |
8 |
import org.jsoup.nodes.Element; |
import org.jsoup.nodes.Element; |
9 |
import org.jsoup.select.Elements; |
import org.jsoup.select.Elements; |
10 |
|
|
11 |
|
import com.google.common.cache.Cache; |
12 |
|
import com.google.common.cache.CacheBuilder; |
13 |
|
|
14 |
|
import dk.thoerup.genericjavautils.HttpUtil; |
15 |
|
|
16 |
public class IMDBSeriesScraper { |
public class IMDBSeriesScraper { |
17 |
|
|
18 |
|
private static Cache<String,String> webCache = CacheBuilder.newBuilder() |
19 |
|
.expireAfterWrite(5, TimeUnit.MINUTES) |
20 |
|
.maximumSize(1000) |
21 |
|
.build(); |
22 |
|
|
23 |
|
private String getDoc(final String url) throws Exception { |
24 |
|
|
25 |
|
return webCache.get(url, new Callable<String>() { |
26 |
|
|
27 |
|
@Override |
28 |
|
public String call() throws Exception { |
29 |
|
return HttpUtil.getContentString(url, 10000); |
30 |
|
} |
31 |
|
}); |
32 |
|
|
33 |
|
} |
34 |
|
|
35 |
public String fechTitle(String sid, int season, int episode) throws Exception { |
public String fechTitle(String sid, int season, int episode) throws Exception { |
36 |
|
|
37 |
final String epMatch = "ep" + episode; |
final String epMatch = "ep" + episode; |
39 |
|
|
40 |
String url = "http://www.imdb.com/title/" + sid + "/episodes?season=" + season; |
String url = "http://www.imdb.com/title/" + sid + "/episodes?season=" + season; |
41 |
|
|
42 |
Document page = Jsoup.connect(url) |
String rawHtml = getDoc(url); |
43 |
.get(); |
|
44 |
|
Document page = Jsoup.parse(rawHtml); |
45 |
|
|
46 |
Element episodesContent = page.getElementById("episodes_content"); |
Element episodesContent = page.getElementById("episodes_content"); |
47 |
|
|