Here is what I did to extract the data from the book store.
1. Figured out the data format for each page. Each page was a jpg file
2. Figured out the HTTP request for each page. The request contained the ISBN number, the page number, and the resolution.
3. Used HttpFox, a Firefox plugin, to inspect the HTTP data sent across.
4. Wrote this application to fetch all the pages for a set of books and save the images on disk. Additionally, it creates a PDF file with all the images.
Rather than using a Java application, I could have used some kind of scripting language. I may look into that later if I get the time; for now I can print the few pages that I want to read :-)
Here is the java source code for Main.java
package bookextracter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import javax.imageio.stream.FileImageOutputStream;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.itextpdf.text.BadElementException;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Image;
import com.itextpdf.text.PageSize;
import com.itextpdf.text.pdf.PdfWriter;
/**
* Library Dependencies
* iText (http://itextpdf.com/index.php)
* Used for creating pdf documents.
* Some really good example code using iText (http://www.roseindia.net/java/itext/index.shtml)
*
* Apache Commons Logger (http://commons.apache.org/logging/)
* A generic interface for logging.
*
* The high level steps are as follows
* 1. Log into the online book site using your browser
* 2. Use an HTTP monitor (something like httpfox for firefox) to introspect the request/response messages
* 3. As you browse through the online book pages, note the cookie header in the GET request for each page. Write down this cookie string,
* since you will be using it in this program.
*
*
*/
public class Main {
static Log _logger = LogFactory.getLog(Main.class);
/**
* The cookie that is unique to my login.
*/
public static final String cookie = "not real cookie";
/**
* Each page is a jpg image, and this determines the scaling factor used when we request it from the server.
*/
public static final int MAX_SCALE = 1200;
/**
* I loop through at most this many pages, stopping early when the server responds with a code other than '200' (e.g. '500').
* Here I assume that all the books contain a max of 1500 pages.
*/
public static final int MAX_ESTIMATE_BOOK_SIZE_IN_PGS = 1500;
public static final String bookSaveLocation = "C:\\Extract\\Books\\";
public static final String rawImageSaveLocation = "C:\\Extract\\RawImages\\";
/**
 * Starts one {@link BookFetcher} thread per ISBN and blocks until every
 * worker has finished.
 *
 * @param args command line arguments (not used)
 * @throws InterruptedException if the main thread is interrupted while
 *         waiting for a worker to finish
 * @throws HttpException kept in the signature for compatibility
 * @throws IOException kept in the signature for compatibility
 * @throws DocumentException kept in the signature for compatibility
 */
public static void main(String[] args) throws HttpException, IOException,
        DocumentException, InterruptedException {
    // These are the ISBN numbers of the books that I am interested in.(not real)
    // NOTE(review): each ISBN should appear only once. Every worker writes files
    // named after its ISBN, so a duplicate entry makes two threads write the
    // same PDF and page images at the same time.
    String[] bookIsbs = { "234234234", "1234234234", "234234234"};

    // Fetch all the pages of each book in its own thread.
    Thread[] workers = new Thread[bookIsbs.length];
    for (int i = 0; i < bookIsbs.length; i++) {
        workers[i] = new Thread(new BookFetcher(bookIsbs[i]));
        workers[i].start();
    }

    // Wait for the workers themselves rather than for a key press, so the
    // program ends as soon as the last book is done.
    for (Thread worker : workers) {
        worker.join();
    }
}
private static class BookFetcher implements Runnable {
private String isbn;
public BookFetcher(String isbn) {
this.isbn = isbn;
}
@Override
public void run() {
try {
extractAndSaveBook(isbn);
} catch (Exception e) {
_logger.error(e);
e.printStackTrace();
}
}
}
/**
 * Fetches the page images of one book and adds them, in page order, to a
 * PDF written to {@code bookSaveLocation + isbn + ".pdf"}.
 *
 * Pages are requested from page number 0 up to
 * {@code MAX_ESTIMATE_BOOK_SIZE_IN_PGS - 1}. The loop stops early at the
 * first page for which the server answers with a status other than 200.
 * Page images already present under {@code rawImageSaveLocation} are reused
 * instead of being downloaded again.
 *
 * @param isbn ISBN of the book to extract
 * @throws IOException if a page cannot be downloaded, read or written
 * @throws HttpException if the HTTP client reports a protocol error
 * @throws FileNotFoundException if the PDF file cannot be opened for writing
 * @throws DocumentException if iText rejects the document or an image
 */
private static void extractAndSaveBook(String isbn) throws IOException,
        HttpException, FileNotFoundException, DocumentException {
    // The format of the pdf doc.
    Document pdfDoc = new Document(PageSize.A2, 0, 0, 0, 0);

    // The file name and location of the pdf file
    final String pdfFileName = bookSaveLocation + isbn + ".pdf";

    // FileOutputStream does not create missing directories. If this fails,
    // the constructor below still reports it as a FileNotFoundException.
    new File(bookSaveLocation).mkdirs();

    FileOutputStream pdfOut = new FileOutputStream(pdfFileName);
    try {
        PdfWriter.getInstance(pdfDoc, pdfOut);
        pdfDoc.open();

        // Set up the http Connection
        _logger.debug("Creating an Http Client");
        HttpClient httpClient = new HttpClient();
        httpClient.getHostConfiguration().setProxy("webproxy", 80); // I use a proxy.
        _logger.info("Retriving Book " + isbn);

        // Loop through each page.
        for (int currentPageNumber = 0; currentPageNumber < MAX_ESTIMATE_BOOK_SIZE_IN_PGS; currentPageNumber++) {
            // Create the page url
            String pageUrl = construcPageUrl(isbn, currentPageNumber, MAX_SCALE);
            _logger.debug("Retrive Page: " + pageUrl);

            // Print the page number to indicate progress, 30 numbers per line.
            System.out.print(currentPageNumber + "..");
            if ((currentPageNumber + 1) % 30 == 0) {
                System.out.println("");
            }

            // Each page image is stored as <rawImageSaveLocation><isbn>\Page_<currentPageNumber>.jpg
            String pagePathName = rawImageSaveLocation + isbn + "\\Page_" + currentPageNumber
                    + ".jpg";

            if (!ensurePageImageOnDisk(httpClient, pageUrl, pagePathName)) {
                // The server has no such page: treat it as the end of the book.
                break;
            }

            // Save the image page to pdf.
            savePageImageToPdfDoc(pdfDoc, pdfFileName, pagePathName);
        }
        System.out.println("");
        _logger.info("Book " + isbn + " ....Done");
        pdfDoc.close();
    } finally {
        // Release the file handle even when a page fails half way through.
        // Closing a stream that is already closed has no effect.
        pdfOut.close();
    }
}

/**
 * Makes sure the image for one page exists on disk, downloading it only if
 * it is missing. This avoids hitting the server again on repeated runs,
 * since fetching each page is the most time consuming operation.
 *
 * @param httpClient client used for the download
 * @param pageUrl URL of the page image
 * @param pagePathName local file the page image is stored in
 * @return {@code true} if the image file is available, {@code false} if the
 *         server answered with a status other than 200
 * @throws IOException if the download or the write to disk fails
 * @throws HttpException if the HTTP client reports a protocol error
 */
private static boolean ensurePageImageOnDisk(HttpClient httpClient, String pageUrl,
        String pagePathName) throws IOException, HttpException {
    File imageFile = new File(pagePathName);
    if (imageFile.exists() && imageFile.isFile()) {
        return true;
    }

    GetMethod getPage = new GetMethod(pageUrl);
    try {
        getPage.setRequestHeader("Cookie", cookie); // Setup the http GET request with my cookie
        int responseCode = httpClient.executeMethod(getPage);
        if (responseCode != 200) {
            _logger.info("Page Not found, I assume we are done with the book");
            return false;
        }
        // Get the data
        _logger.debug("Get the Page");
        byte[] pageRawByteArray = getPage.getResponseBody();
        // Save the Raw Image to file
        saveRawImageToFile(pagePathName, pageRawByteArray);
        return true;
    } finally {
        // Always hand the connection back to the client's connection manager,
        // whatever the outcome of the request.
        getPage.releaseConnection();
    }
}
/**
 * Adds the page image stored at {@code pagePathName} to the given PDF
 * document.
 *
 * @param pdfDoc document the image is added to
 * @param pdfFileName name of the PDF file; used only in the debug log line
 * @param pagePathName path of the image file to load
 * @throws BadElementException if iText cannot build an image element
 * @throws MalformedURLException if iText cannot resolve the image location
 * @throws IOException if the image file cannot be read
 * @throws DocumentException if the image cannot be added to the document
 */
private static void savePageImageToPdfDoc(Document pdfDoc,
        final String pdfFileName, String pagePathName)
        throws BadElementException, MalformedURLException, IOException,
        DocumentException {
    _logger.debug("Saving to pdf Doc " + pdfFileName);
    // Load the image from disk and append it in one step, with no scaling applied.
    pdfDoc.add(Image.getInstance(pagePathName));
}
/**
 * Writes the raw bytes of a downloaded page image to a file, creating the
 * parent directory first if it does not exist yet.
 *
 * @param pagePathName path of the file to write
 * @param pageRawByteArray image bytes exactly as received from the server
 * @throws FileNotFoundException if the file cannot be opened for writing
 * @throws IOException if writing or closing the file fails
 */
private static void saveRawImageToFile(String pagePathName,
        byte[] pageRawByteArray) throws FileNotFoundException, IOException {
    _logger.debug("Saving Image to a file : " + pagePathName);
    File target = new File(pagePathName);

    // The per-book directory is not created anywhere else. If creating it
    // fails, opening the stream below still throws FileNotFoundException.
    File parentDir = target.getParentFile();
    if (parentDir != null) {
        parentDir.mkdirs();
    }

    // This is a plain byte dump, so a FileOutputStream is all that is needed.
    FileOutputStream imageWriter = new FileOutputStream(target);
    try {
        imageWriter.write(pageRawByteArray);
    } finally {
        // Close the file even if the write fails.
        imageWriter.close();
    }
}
private static String construcPageUrl(String bookIsbn, int pageNo, int scale) {
return "http://booksitehostname/" + bookIsbn + "
+ scale + "/" + pageNo + ".jpg";
}
}
No comments:
Post a Comment