Friday, December 11, 2009

Extracting and creating a book from an online site

I have a subscription to an online book store (similar to Safari). Unfortunately, I cannot share the store's name. I wanted certain books from the store on my local machine as a printable PDF file, but the bookstore allowed me to print only 2 pages.

Here is what I did to extract the data from the book store:
1. Figured out the data format for each page. Each page was a jpg file.
2. Figured out the HTTP request for each page. The request contained the ISBN number, the page number and the resolution.
3. Used HttpFox - a Firefox plugin - to introspect the HTTP data sent across (a minimal sketch of such a request follows this list).
4. Wrote the application below to fetch all the pages for a set of books and save the images to disk; it also creates a PDF file from all the images.
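
To make step 2 concrete, here is a minimal sketch of what a single-page request looks like with Commons HttpClient. The host name, URL pattern, ISBN and cookie value are placeholders I am using for illustration, not the store's real ones; the full program further down does the same thing in a loop.

package bookextracter;

import java.io.FileOutputStream;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;

public class SinglePageFetch {

 public static void main(String[] args) throws Exception {
  String isbn = "1234567890";        // placeholder ISBN
  int pageNo = 1;                    // page number to fetch
  int scale = 1200;                  // resolution/scaling factor
  String cookie = "not real cookie"; // session cookie copied from the browser

  // Hypothetical URL pattern: http://<host>/<isbn>/<scale>/<page>.jpg
  String pageUrl = "http://booksitehostname/" + isbn + "/" + scale + "/" + pageNo + ".jpg";

  HttpClient client = new HttpClient();
  GetMethod getPage = new GetMethod(pageUrl);
  getPage.setRequestHeader("Cookie", cookie); // reuse the logged-in browser session

  int responseCode = client.executeMethod(getPage);
  if (responseCode == 200) {
   // Save the raw jpg bytes to the current directory
   byte[] pageBytes = getPage.getResponseBody();
   FileOutputStream out = new FileOutputStream("Page_" + pageNo + ".jpg");
   out.write(pageBytes);
   out.close();
  }
  getPage.releaseConnection();
 }
}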

Rather than using a Java application I could have used some kind of scripting language. I may look into that later if I get the time; as of now I can print the few pages that I want to read :-)

Here is the Java source code for Main.java:

package bookextracter;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;

import javax.imageio.stream.FileImageOutputStream;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.itextpdf.text.BadElementException;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Image;
import com.itextpdf.text.PageSize;
import com.itextpdf.text.pdf.PdfWriter;

/**
 * Library Dependencies
 *   iText (http://itextpdf.com/index.php)
 *   Used for creating pdf documents.
 *   Some really good example code using iText (http://www.roseindia.net/java/itext/index.shtml)
 * 
 *   Apache Commons Logging (http://commons.apache.org/logging/)
 *   A generic interface for logging.
 * 
 *   Apache Commons HttpClient (http://hc.apache.org/httpclient-3.x/)
 *   Used for making the HTTP GET request for each page.
 * 
 * The high level steps are as follows:
 * 1. Log into the online book site using your browser.
 * 2. Use an HTTP monitor (something like HttpFox for Firefox) to introspect the request/response messages.
 * 3. As you are browsing through the online book pages, note the Cookie header in the GET request for each page. Note down this cookie string
 * since you will be using it in this program.
 * 
 * 
 */
public class Main {

 static Log _logger = LogFactory.getLog(Main.class);
 
 /**
  * The cookie that is unique to my login.
  */
 public static final String cookie = "not real cookie";
 
 /**
  * Each page is a jpg image, and this value determines the scaling factor (resolution) used when we make a request to the server.
  */
 public static final int MAX_SCALE = 1200;
 
 /**
  * I loop through up to this many pages until I reach the end of the book or the server stops responding with code 200.
  * Here I assume that no book contains more than 1500 pages.
  */
 public static final int MAX_ESTIMATE_BOOK_SIZE_IN_PGS = 1500;
 
 public static final String bookSaveLocation = "C:\\Extract\\Books\\";
 public static final String rawImageSaveLocation = "C:\\Extract\\RawImages\\";

 public static void main(String[] args) throws HttpException, IOException,
   DocumentException, InterruptedException {

  
  // These are the ISBN numbers of the books that I am interested in (not real).
  String[] bookIsbns = { "234234234", "1234234234", "234234234"};
  
  // Loop through each book and fetch all the pages for each book in a different thread.
  for (String isbn : bookIsbns) {
   (new Thread(new BookFetcher(isbn))).start();
  }
  
  // Wait for a key press; the fetcher threads run in the background.
  System.in.read();

 }

 private static class BookFetcher implements Runnable {
  private String isbn;

  public BookFetcher(String isbn) {
   this.isbn = isbn;
  }

  @Override
  public void run() {
   try {
    extractAndSaveBook(isbn);
   } catch (Exception e) {
    _logger.error(e);
    e.printStackTrace();
   }

  }
 }

 private static void extractAndSaveBook(String isbn) throws IOException,
   HttpException, FileNotFoundException, DocumentException {
  
  // The format of the pdf doc.
  Document pdfDoc = new Document(PageSize.A2, 0, 0, 0, 0);
//  Document pdfDoc = new Document(PageSize.LETTER,0,0,0,0);
//  Document pdfDoc = new Document();  
  
  // The file name and location of the pdf file.
  // Make sure the output folder exists before writing the pdf.
  new File(bookSaveLocation).mkdirs();
  final String pdfFileName = bookSaveLocation + isbn + ".pdf";
  PdfWriter.getInstance(pdfDoc, new FileOutputStream(pdfFileName));
  pdfDoc.open();

  // Set up the http Connection
  _logger.debug("Creating an Http Client");
  HttpClient httpClient = new HttpClient();
  httpClient.getHostConfiguration().setProxy("webproxy", 80); // I use a proxy.

  _logger.info("Retriving Book " + isbn);
  
  // Loop through each page.
  for (int currentPageNumber = 0; currentPageNumber < MAX_ESTIMATE_BOOK_SIZE_IN_PGS; currentPageNumber++) {
   
   // Create the page url
   String pageUrl = constructPageUrl(isbn, currentPageNumber, MAX_SCALE);
   _logger.debug("Retrieve Page: " + pageUrl);
   
   // Print the page number to indicate progress.
   System.out.print(currentPageNumber + "..");
   if ((currentPageNumber + 1) % 30 == 0)
    System.out.println("");

   // The retrieved image for each page will be saved at the following location (file format: <rawImageSaveLocation>/<isbn>/Page_<currentPageNo>.jpg)
   String pagePathName = rawImageSaveLocation + isbn + "\\Page_" + currentPageNumber
     + ".jpg";
   
   File imageFile = new File(pagePathName);
   // If the file doesn't exist at that location, then fetch the file from the server.
   // This prevents us from hitting the server during multiple runs, since fetching 
   // each page is the most time consuming operation.
   if (!(imageFile.exists() && imageFile.isFile())) { 
    GetMethod getPage = new GetMethod(pageUrl);
    getPage.setRequestHeader("Cookie", cookie); // Setup the http GET request with my cookie 

    int responseCode = httpClient.executeMethod(getPage);
    if (responseCode != 200) {
     _logger.info("Page Not found, I assume we are done with the book");
     break;
    }

    // Get the data
    _logger.debug("Get the Page");
    byte[] pageRawByteArray = getPage.getResponseBody();
    
    // Save the Raw Image to file
    saveRawImageToFile(pagePathName, pageRawByteArray);
   }
   // Save the image page to pdf.
   savePageImageToPdfDoc(pdfDoc, pdfFileName,
     pagePathName);
  }
  System.out.println("");
  _logger.info("Book " + isbn + " ....Done");
  pdfDoc.close();

 }

 private static void savePageImageToPdfDoc(Document pdfDoc,
   final String pdfFileName, String pagePathName)
   throws BadElementException, MalformedURLException, IOException,
   DocumentException {
  _logger.debug("Saving to pdf Doc " + pdfFileName);
  // Image pagePdfImage = Image.getInstance(pageRawByteArray);
  Image pagePdfImage = Image.getInstance(pagePathName);
  // pagePdfImage.scalePercent(50);
  pdfDoc.add(pagePdfImage);
 }

 private static void saveRawImageToFile(String pagePathName,
   byte[] pageRawByteArray) throws FileNotFoundException, IOException {
  _logger.debug("Saving Image to a file : " + pagePathName);
  File imageFile = new File(pagePathName);
  // Create the per-book folder if it does not already exist, otherwise the write fails.
  imageFile.getParentFile().mkdirs();
  FileImageOutputStream imageWriter = new FileImageOutputStream(imageFile);
  imageWriter.write(pageRawByteArray);
  imageWriter.close();
 }

 private static String constructPageUrl(String bookIsbn, int pageNo, int scale) {
  return "http://booksitehostname/" + bookIsbn + "/" + scale + "/" + pageNo
    + ".jpg";
 }

}
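
One small variation I considered: the program above adds each jpg at its raw size, with scalePercent(50) left commented out. If the raw images do not sit nicely on the A2 page, iText can scale each image to fit instead. Here is a sketch of an alternative savePageImageToPdfDoc that could be dropped into the class above; the scaleToFit call is standard iText, everything else is unchanged.

 private static void savePageImageToPdfDoc(Document pdfDoc,
   final String pdfFileName, String pagePathName)
   throws BadElementException, MalformedURLException, IOException,
   DocumentException {
  _logger.debug("Saving to pdf Doc " + pdfFileName);
  Image pagePdfImage = Image.getInstance(pagePathName);
  // Scale the image to fit inside an A2 page while preserving its aspect ratio.
  pagePdfImage.scaleToFit(PageSize.A2.getWidth(), PageSize.A2.getHeight());
  pdfDoc.add(pagePdfImage);
 }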
