Here is what I did to extract the data from the book store.
1. Figured out the data format for each page. Each page was a jpg file
2. Figured out the HTTP request for each page. The request contained the ISBN number, the page number, and the resolution.
3. Used HttpFox, a Firefox plugin, to inspect the HTTP data sent across the wire.
4. Wrote this application to fetch all the pages for a set of books and save each page image to disk. It also creates a PDF file containing all the images.
Rather than writing a Java application, I could have used a scripting language. I may look into that later if I get the time; for now, I can print the few pages that I want to read :-)
Here is the java source code for Main.java
package bookextracter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.net.MalformedURLException; import javax.imageio.stream.FileImageOutputStream; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import com.itextpdf.text.BadElementException; import com.itextpdf.text.Document; import com.itextpdf.text.DocumentException; import com.itextpdf.text.Image; import com.itextpdf.text.PageSize; import com.itextpdf.text.pdf.PdfWriter; /** * Library Dependencies * iText (http://itextpdf.com/index.php) * Used for creating pdf documents. * Some really good example code using iText (http://www.roseindia.net/java/itext/index.shtml) * * Apache Commons Logger (http://commons.apache.org/logging/) * A generic interface for logging. * * The high level steps are as follows * 1. Log into the online book site using your browser * 2. Use an HTTP monitor (something like httpfox for firefox) to introspect the request/response messages * 3. As your are browsing through the online book pages note the cookie headers in the GET request for each page. Note down this cookie string * since you will be using it in this program * * */ public class Main { static Log _logger = LogFactory.getLog(Main.class); /** * The cookie that is unique to my login. */ public static final String cookie = "not real cookie"; /** * Each pages is a jpg image and this determines the scaling factor when we make a request from the server. */ public static final int MAX_SCALE = 1200; /** * I loop through these many number of pages till reach the end or the server responses with code '500' * Here I assume that all the books contain a max of 1500 pages. 
*/ public static final int MAX_ESTIMATE_BOOK_SIZE_IN_PGS = 1500; public static final String bookSaveLocation = "C:\\Extract\\Books\\"; public static final String rawImageSaveLocation = "C:\\Extract\\RawImages\\"; public static void main(String[] args) throws HttpException, IOException, DocumentException, InterruptedException { // These are the ISBN numbers of the books that I am interested in.(not real) String[] bookIsbs = { "234234234", "1234234234", "234234234"}; // loop through each book and fetch all the pages for each book in a different thread. for (String isbn : bookIsbs) { (new Thread(new BookFetcher(isbn))).start(); } // Wait for it. System.in.read(); } private static class BookFetcher implements Runnable { private String isbn; public BookFetcher(String isbn) { this.isbn = isbn; } @Override public void run() { try { extractAndSaveBook(isbn); } catch (Exception e) { _logger.error(e); e.printStackTrace(); } } } private static void extractAndSaveBook(String isbn) throws IOException, HttpException, FileNotFoundException, DocumentException { // The format of the pdf doc. Document pdfDoc = new Document(PageSize.A2, 0, 0, 0, 0); // Document pdfDoc = new Document(PageSize.LETTER,0,0,0,0); // Document pdfDoc = new Document(); // The file name and location of the pdf file final String pdfFileName = bookSaveLocation + isbn + ".pdf"; PdfWriter.getInstance(pdfDoc, new FileOutputStream(pdfFileName)); pdfDoc.open(); // Set up the http Connection _logger.debug("Creating an Http Client"); HttpClient httpClient = new HttpClient(); httpClient.getHostConfiguration().setProxy("webproxy", 80); // I use a proxy. _logger.info("Retriving Book " + isbn); // Loop through each page. for (int currentPageNumber = 0; currentPageNumber < MAX_ESTIMATE_BOOK_SIZE_IN_PGS; currentPageNumber++) { // Create the page url String pageUrl = construcPageUrl(isbn, currentPageNumber, MAX_SCALE); _logger.debug("Retrive Page: " + pageUrl); // Print the page number do indicate progress. 
System.out.print(currentPageNumber + ".."); if ((currentPageNumber + 1) % 30 == 0) System.out.println(""); // The retrived image for each page will be saved at the following location (file format: <rawImageSaveLocation>/<isbn>/Page_<currentPageNo>.jpg String pagePathName = rawImageSaveLocation + isbn + "\\Page_" + currentPageNumber + ".jpg"; File imageFile = new File(pagePathName); // If the file dosent exist at the location the fetch the file from the server. // This prevents us from hitting the server during multiple runs, since fetching // each page is the most time consuming operation. if (!(imageFile.exists() && imageFile.isFile())) { GetMethod getPage = new GetMethod(pageUrl); getPage.setRequestHeader("Cookie", cookie); // Setup the http GET request with my cookie int responseCode = httpClient.executeMethod(getPage); if (responseCode != 200) { _logger.info("Page Not found, I assume we are done with the book"); break; } // Get the data _logger.debug("Get the Page"); byte[] pageRawByteArray = getPage.getResponseBody(); // Save the Raw Image to file saveRawImageToFile(pagePathName, pageRawByteArray); } // Save the image page to pdf. 
savePageImageToPdfDoc(pdfDoc, pdfFileName, pagePathName); } System.out.println(""); _logger.info("Book " + isbn + " ....Done"); pdfDoc.close(); } private static void savePageImageToPdfDoc(Document pdfDoc, final String pdfFileName, String pagePathName) throws BadElementException, MalformedURLException, IOException, DocumentException { _logger.debug("Saving to pdf Doc " + pdfFileName); // Image pagePdfImage = Image.getInstance(pageRawByteArray); Image pagePdfImage = Image.getInstance(pagePathName); // pagePdfImage.scalePercent(50); pdfDoc.add(pagePdfImage); } private static void saveRawImageToFile(String pagePathName, byte[] pageRawByteArray) throws FileNotFoundException, IOException { _logger.debug("Saving Image to a file : " + pagePathName); FileImageOutputStream imageWriter = new FileImageOutputStream( new File(pagePathName)); imageWriter.write(pageRawByteArray); imageWriter.close(); } private static String construcPageUrl(String bookIsbn, int pageNo, int scale) { return "http://booksitehostname/" + bookIsbn + " + scale + "/" + pageNo + ".jpg"; } }
No comments:
Post a Comment