1 |
package dk.thoerup.spejdernetscraper; |
package dk.thoerup.spejdernetscraper; |
2 |
|
|
3 |
|
import java.util.concurrent.Callable; |
4 |
|
import java.util.concurrent.TimeUnit; |
5 |
|
|
6 |
import org.jsoup.Jsoup; |
import org.jsoup.Jsoup; |
7 |
import org.jsoup.nodes.Document; |
import org.jsoup.nodes.Document; |
8 |
import org.jsoup.nodes.Element; |
import org.jsoup.nodes.Element; |
9 |
import org.jsoup.select.Elements; |
import org.jsoup.select.Elements; |
10 |
|
|
11 |
|
import com.google.common.cache.Cache; |
12 |
|
import com.google.common.cache.CacheBuilder; |
13 |
|
|
14 |
|
import dk.thoerup.genericjavautils.HttpUtil; |
15 |
|
|
16 |
public class IMDBSeriesScraper { |
public class IMDBSeriesScraper { |
17 |
|
|
18 |
|
private static Cache<String,String> webCache = CacheBuilder.newBuilder() |
19 |
|
.expireAfterWrite(5, TimeUnit.MINUTES) |
20 |
|
.maximumSize(1000) |
21 |
|
.build(); |
22 |
|
|
23 |
|
private String getDoc(final String url) throws Exception { |
24 |
|
|
25 |
|
return webCache.get(url, new Callable<String>() { |
26 |
|
|
27 |
|
@Override |
28 |
|
public String call() throws Exception { |
29 |
|
return HttpUtil.getContentString(url, 10000); |
30 |
|
} |
31 |
|
}); |
32 |
|
|
33 |
|
} |
34 |
|
|
35 |
public String fechTitle(String sid, int season, int episode) throws Exception { |
public String fechTitle(String sid, int season, int episode) throws Exception { |
36 |
|
|
37 |
final String epMatch = "Ep" + episode; |
final String epMatch = "ep" + episode; |
38 |
final String noTitle = "Episode #" + season + "." + episode; |
final String noTitle = "Episode #" + season + "." + episode; |
39 |
|
|
40 |
String url = "http://www.imdb.com/title/" + sid + "/episodes?season=" + season; |
String url = "http://www.imdb.com/title/" + sid + "/episodes?season=" + season; |
41 |
|
|
42 |
Document page = Jsoup.connect(url) |
String rawHtml = getDoc(url); |
43 |
.get(); |
|
44 |
|
Document page = Jsoup.parse(rawHtml); |
45 |
|
|
46 |
Element episodesContent = page.getElementById("episodes_content"); |
Element episodesContent = page.getElementById("episodes_content"); |
47 |
|
|
55 |
Element anchor = image.child(0); |
Element anchor = image.child(0); |
56 |
|
|
57 |
String title = anchor.attr("title"); |
String title = anchor.attr("title"); |
58 |
String content = anchor.text().trim(); |
//String content = anchor.text().trim(); |
59 |
|
String href = anchor.attr("href"); |
60 |
|
|
61 |
if ( content.endsWith(epMatch)) { |
if ( href.endsWith(epMatch)) { |
62 |
if (title.equalsIgnoreCase(noTitle)) |
if (title.equalsIgnoreCase(noTitle)) { |
63 |
throw new Exception("Episode found without title"); |
return "!Title not found!"; |
64 |
|
} |
65 |
|
|
66 |
return title; |
return title; |
67 |
} |
} |