/[projects]/miscJava/SpejdernetScraper/src/main/java/dk/thoerup/spejdernetscraper/IMDBSeriesScraper.java
ViewVC logotype

Annotation of /miscJava/SpejdernetScraper/src/main/java/dk/thoerup/spejdernetscraper/IMDBSeriesScraper.java

Parent Directory | Revision Log


Revision 3231 - (hide annotations) (download)
Wed May 16 11:39:19 2018 UTC (6 years ago) by torben
File size: 1914 byte(s)
Cache html page from source
1 torben 2067 package dk.thoerup.spejdernetscraper;
2    
3 torben 3231 import java.util.concurrent.Callable;
4     import java.util.concurrent.TimeUnit;
5    
6 torben 2067 import org.jsoup.Jsoup;
7     import org.jsoup.nodes.Document;
8     import org.jsoup.nodes.Element;
9     import org.jsoup.select.Elements;
10    
11 torben 3231 import com.google.common.cache.Cache;
12     import com.google.common.cache.CacheBuilder;
13    
14     import dk.thoerup.genericjavautils.HttpUtil;
15    
16 torben 2067 public class IMDBSeriesScraper {
17    
18 torben 3231 private static Cache<String,String> webCache = CacheBuilder.newBuilder()
19     .expireAfterWrite(5, TimeUnit.MINUTES)
20     .maximumSize(1000)
21     .build();
22    
23     private String getDoc(final String url) throws Exception {
24    
25     return webCache.get(url, new Callable<String>() {
26    
27     @Override
28     public String call() throws Exception {
29     return HttpUtil.getContentString(url, 10000);
30     }
31     });
32    
33     }
34    
35 torben 2067 public String fechTitle(String sid, int season, int episode) throws Exception {
36    
37 torben 3230 final String epMatch = "ep" + episode;
38 torben 2068 final String noTitle = "Episode #" + season + "." + episode;
39 torben 2067
40     String url = "http://www.imdb.com/title/" + sid + "/episodes?season=" + season;
41    
42 torben 3231 String rawHtml = getDoc(url);
43 torben 2067
44 torben 3231 Document page = Jsoup.parse(rawHtml);
45    
46 torben 2067 Element episodesContent = page.getElementById("episodes_content");
47    
48     Elements episodesList = episodesContent.getElementsByClass("list_item");
49    
50     for (int i=0; i<episodesList.size(); i++) {
51     Element curEp = episodesList.get(i);
52    
53     Element image = curEp.getElementsByClass("image").first();
54    
55     Element anchor = image.child(0);
56    
57     String title = anchor.attr("title");
58 torben 3230 //String content = anchor.text().trim();
59     String href = anchor.attr("href");
60 torben 2067
61 torben 3230 if ( href.endsWith(epMatch)) {
62 torben 2119 if (title.equalsIgnoreCase(noTitle)) {
63     return "!Title not found!";
64     }
65 torben 2068
66 torben 2067 return title;
67     }
68     }
69    
70    
71     throw new Exception("Episode not found !");
72     }
73    
74     }

  ViewVC Help
Powered by ViewVC 1.1.20