package edu.columbia.main;

import org.apache.log4j.Logger;
import edu.columbia.main.language_id.LanguageCode;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import edu.columbia.main.LogDB;

/**
 * Plain holder describing one transcript to download.
 */
class URL {
    /** Full address of the transcript resource. */
    public String url;
    /** Language code the transcript was requested for. */
    public String language;
    /** Talk identifier; TEDScraper fills it with the path segment following "/talks/". */
    public String title;

    URL(String url, String language, String title) {
        this.url = url;
        this.language = language;
        this.title = title;
    }
}

/**
 * Fetches all the transcripts in a specific languageCode from TED.COM
 */
public class TEDScraper {

    Logger log = Logger.getLogger(TEDScraper.class);

    /** location of transcripts */
    private static final String VIDEOS_URL = "https://www.ted.com/talks/browse?language=";

    /** scheme and host prepended to the relative talk links found on the listing pages */
    private static final String SITE_ROOT = "https://www.ted.com";

    /** path prefix that precedes the talk name in a talk link */
    private static final String TALKS_PATH = "/talks/";

    /** container for all the transcripts urls of a specific languageCode */
    private ArrayList<URL> urls;

    /** the destination language */
    private String language;

    private LogDB logDb;

    /**
     * Parses all the pages containing links to talks in a specific languageCode,
     * saves them to urls and then calls getAndSaveData().
     * If the site reports that it has no talks for the language, or the first
     * listing page cannot be fetched, the constructor logs that and returns
     * without downloading anything.
     *
     * @param language destination languageCode; it is converted with
     *                 LanguageCode.convertIso2toIso1 before being put in request URLs
     */
    public TEDScraper(String language) {
        this.language = language;
        String iso1Lang = LanguageCode.convertIso2toIso1(language);
        this.logDb = new LogDB(this.language);
        urls = new ArrayList<URL>();

        log.info("Scraping TED.COM for subtitles in: " + language);

        // get the first page and parse
        String html = new HTTPClient(VIDEOS_URL + iso1Lang).getHTMLData();
        // NOTE(review): HTTPClient is defined elsewhere; this guards the case where it
        // yields null on a failed request instead of crashing on html.contains below.
        if (html == null) {
            log.error("Could not fetch the TED.COM talk listing for " + language);
            log.info("Stopping process");
            return;
        }
        if (html.contains("We couldn't find a talk quite like that")) {
            log.info("TED.COM Does not have any talks in " + language + " langauge code");
            log.info("Stopping process");
            return;
        }

        Document doc = Jsoup.parse(html);
        int numOfPages = readPageCount(doc);

        // for every page of that languageCode
        for (int i = 1; i <= numOfPages; i++) {
            log.info("Getting links from page: " + i + " out of: " + numOfPages);

            // the first page was already fetched above
            if (i != 1) {
                html = new HTTPClient(VIDEOS_URL + iso1Lang + "&page=" + i).getHTMLData();
                if (html == null) {
                    log.warn("Could not fetch listing page " + i + ", skipping it");
                    continue;
                }
                doc = Jsoup.parse(html);
            }
            collectTranscriptUrls(doc, iso1Lang);
        }

        getAndSaveData();
    }

    /**
     * Reads the number of listing pages from the last ".pagination__item" element.
     *
     * @param doc parsed first listing page
     * @return the number shown in the last pagination item, or 1 when the page has
     *         no pagination or the item's text is not a number
     */
    private int readPageCount(Document doc) {
        Element lastPagination = doc.select(".pagination__item").last();
        if (lastPagination == null) {
            return 1;
        }
        String text = lastPagination.text().trim();
        try {
            return Integer.parseInt(text);
        } catch (NumberFormatException e) {
            // better to scrape the page we already have than to abort the whole run
            log.warn("Could not read the page count from pagination item '" + text
                    + "', assuming a single page", e);
            return 1;
        }
    }

    /**
     * Adds one transcript URL to urls for every talk link inside the
     * ".media__message" containers of a listing page. Links whose path does not
     * contain "/talks/" followed by a talk name are skipped.
     *
     * @param doc      parsed listing page
     * @param iso1Lang language code placed in the transcript request
     */
    private void collectTranscriptUrls(Document doc, String iso1Lang) {
        Elements links = doc.select(".media__message").select("a");

        for (Element link : links) {
            // /talks/ze_frank_are_you_human?languageCode=lt -> /talks/ze_frank_are_you_human
            String talkPath = stripQuery(link.attr("href"));

            int prefixStart = talkPath.indexOf(TALKS_PATH);
            String title = prefixStart < 0
                    ? ""
                    : talkPath.substring(prefixStart + TALKS_PATH.length());
            if (title.length() == 0) {
                log.warn("Skipping link that does not point to a talk: '" + talkPath + "'");
                continue;
            }

            urls.add(new URL(
                    SITE_ROOT + talkPath + "/transcript.json?language=" + iso1Lang,
                    language,
                    title));
            // Disabled in the original code: also queueing the same talk with
            // "/transcript.json?language=en" and language "eng".
        }
    }

    /**
     * Returns href without its query string.
     *
     * @param href link value, possibly containing a '?'
     * @return everything before the first '?', or href unchanged when it has none
     */
    private static String stripQuery(String href) {
        int queryStart = href.indexOf('?');
        return queryStart < 0 ? href : href.substring(0, queryStart);
    }

    /**
     * Loads each entry of urls, fetches its content and hands it to a FileSaver
     * (together with the entry's language, the label "TED", the address and
     * title + ".json"), which is then saved with this scraper's LogDB.
     */
    public void getAndSaveData() {
        for (URL url : urls) {
            HTTPClient client = new HTTPClient(url.url);
            String html = client.getHTMLData();
            FileSaver file = new FileSaver(html, url.language, "TED", url.url, url.title + ".json");
            file.save(logDb);
        }
    }
}