Friday, April 16, 2010

Python application for downloading a book and converting to pdf

A similar application for downloading a book's page images from the net and converting them to a PDF file, this time written in Python.

'''
Created on 25 Mar 2010


'''
import httplib
import os
import sys
from optparse import OptionParser

import time, thread
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter, A4
from reportlab.lib.units import cm, mm, inch, pica

# Shorthand for emitting progress markers on stdout without a trailing newline.
write = sys.stdout.write
# Module-level connection object for host 'webproxy', port 8080.
# NOTE(review): downloadBookImages() creates its own connection under a local
# name `h`, so this object looks unused in the visible part of this file -
# confirm before removing.
h = httplib.HTTPConnection('webproxy',8080)
# Headers sent with every page request. The Cookie value is empty here;
# downloadBookImages() reports an expired cookie on HTTP 401, so presumably a
# valid session cookie has to be filled in before running - verify.
headers = {'Cookie':''}
# NOTE(review): not referenced by the visible code; the width in urltemplate
# below is hard-coded to the same value rather than derived from this.
size = 1800
# Request path for a single page image; formatted with (isbn, page number).
urltemplate = '/%s/pages/%i.jpg?width=1800'
# Only referenced by the commented-out batch loops at the end of the file.
books = ('isbn2','isbn1')
# Upper bound on the number of pages iterated per book.
maxPages = 10000

def downloadBookImages(folderLocation,isbn):
    """Download the page images of one book into ``<folderLocation>/<isbn>``.

    Pages are requested one at a time, starting at page 0, through the
    module-level ``urltemplate`` and ``headers``.  Each page is stored as
    ``<page>.jpg``.  Pages whose file already exists are skipped, so an
    interrupted download can be resumed by calling the function again.

    The loop stops when:
      * the server answers 401 (the cookie in ``headers`` is rejected),
      * a response body is shorter than 1000 bytes (treated as end of book),
      * the server answers with any other non-200 status, or
      * ``maxPages`` pages have been tried.

    Progress is written to stdout: ``.`` per request, ``E<n>`` per page that
    was already on disk.

    Args:
        folderLocation: Parent folder that holds one sub-folder per book.
        isbn: Book identifier; used in the request path and as folder name.
    """
    # os.path.join instead of a hard-coded "\\" so the layout is the same
    # book folder on every platform.
    bookLocation = os.path.join(folderLocation, isbn)
    print('Downloading book to %s' % bookLocation)
    if not os.path.exists(bookLocation):
        # makedirs also creates a missing parent folder.
        os.makedirs(bookLocation)

    connection = httplib.HTTPConnection('webproxy', 8080)
    try:
        for page in range(maxPages):
            imageFilePath = os.path.join(bookLocation, '%i.jpg' % page)
            if os.path.exists(imageFilePath):
                write('E%i ' % page)
                continue

            connection.request('GET', urltemplate % (isbn, page), headers=headers)
            write('.')
            response = connection.getresponse()
            if response.status == 401:
                print('Not Authorized Book %s, the cookie has expired' % isbn)
                break

            data = response.read()
            if len(data) < 1000:
                print('End of book %s since the page size is too small : %i' % (isbn, len(data)))
                break
            if response.status != 200:
                # A large error page must not be stored as if it were an image.
                print('Unexpected HTTP status %i for book %s page %i' % (response.status, isbn, page))
                break

            with open(imageFilePath, 'wb') as imageFile:
                imageFile.write(data)
    finally:
        # Release the socket even when a request or a file write fails.
        connection.close()


def pdfDirectory(outputPDFName, imageDirectory ):
    """Build a PDF with one A4 page per numbered JPEG found under a folder.

    The folder is walked recursively (sub-folders in sorted order).  In each
    folder, files named ``<integer>.jpg`` are placed in numeric order, each
    stretched over a full A4 page.  Other files are ignored, and gaps in the
    numbering are tolerated.

    The document is written once, after all pages have been added.  When no
    page image is found, no PDF is written and a message is printed instead.

    Args:
        outputPDFName: Path of the PDF file to create.
        imageDirectory: Folder containing the page images.
    """
    dirim = str(imageDirectory)
    output = str(outputPDFName)
    print('Converting to pdf %s, images %s' % (output, dirim))
    pageWidth, pageHeight = A4
    pdf = canvas.Canvas(output, pagesize=A4)
    pageCount = 0
    for root, dirs, files in os.walk(dirim):
        # Sorting in place makes os.walk visit sub-folders in a stable order.
        dirs.sort()
        for imageName in _numberedJpgNames(files):
            write('.')
            pdf.drawImage(os.path.join(root, imageName), mm * 0.001, mm * 0.001,
                          pageWidth, pageHeight, preserveAspectRatio=False)
            # Finish the page so the next image starts on a page of its own.
            pdf.showPage()
            pageCount += 1

    if pageCount == 0:
        print('No page images found in %s' % dirim)
        return
    # Write the document a single time instead of once per image.
    pdf.save()
    print("PDF of Image directory created %s" % outputPDFName)


def _numberedJpgNames(fileNames):
    """Return the names shaped like ``<integer>.jpg``, ordered by page number."""
    numbered = []
    for fileName in fileNames:
        stem, extension = os.path.splitext(fileName)
        if extension.lower() == '.jpg' and stem.isdigit():
            numbered.append((int(stem), fileName))
    numbered.sort()
    return [fileName for _, fileName in numbered]


def threadExe(location,isbn):
    """Placeholder worker: announce the book, then tick through page numbers.

    Prints one line per page number from 0 up to (not including) the
    module-level ``maxPages``, pausing ten seconds after each line.
    """
    print('Executing Thread for location %s book %s\n' % (location, isbn))
    pageLimit = maxPages
    pageNumber = 0
    while pageNumber < pageLimit:
        print('%s Page - %i \n' % (isbn, pageNumber))
        time.sleep(10)
        pageNumber += 1


def createOptionParser():
 parser = OptionParser(usage="Usage: %prog [options]", version="%prog 1.0")
 parser.add_option("-i", "--isbn", dest="isbn", action='store', type='string')
 parser.add_option("-d", action="store_true", dest="download", help="Download the book images", default=False)
 parser.add_option("-c", action="store_true", dest="converToPdf", help="Convert images to pdf file", default=False)
 parser.add_option("-f", "--imageFolder", type="string", help="Image folder to download or to read from.")
 parser.add_option("-p", "--pdfOutputFolder", type="string", help="Pdf output folder.")
 return parser

if __name__ == '__main__':
    # Script entry point: parse the options, then download and/or convert.
    parser = createOptionParser()
    (options, args) = parser.parse_args()
    if not options.isbn:
        parser.error("You have to specify the isbn book number")

    if options.download or options.converToPdf:
        if not options.imageFolder:
            parser.error("Specify the image folder")
        # Validate the pdf folder up front so a missing option is reported
        # before a potentially long download starts.
        if options.converToPdf and not options.pdfOutputFolder:
            parser.error("Specify the pdf folder")

        if options.download:
            downloadBookImages(options.imageFolder, options.isbn)

        if options.converToPdf:
            # The downloader stores pages in <imageFolder>/<isbn>; convert
            # only that book when the sub-folder exists, otherwise treat the
            # given folder as the image folder itself.
            bookFolder = os.path.join(options.imageFolder, options.isbn)
            if not os.path.isdir(bookFolder):
                bookFolder = options.imageFolder
            # Join with a path separator so "-p out" yields out/<isbn>.pdf
            # rather than "out<isbn>.pdf".
            pdfPath = os.path.join(options.pdfOutputFolder, options.isbn + ".pdf")
            print('Converting to pdf %s, images %s' % (options.pdfOutputFolder, options.imageFolder))
            pdfDirectory(pdfPath, bookFolder)
    else:
        print('Specify if you would like to download or convert the book or both?')
   
# 
# for book in books:
#  downloadBookImages('c:\\tmp',book)
# 
# for book in books:
#  pdfDirectory('w:\\%s.pdf' % book , 'c:\\tmp\\%s' % book)
# 

No comments: