I want to crawl all the table entries (the table that lists the S/No., Document No., etc.) from the following website and write them to Excel. So far, I am able to crawl the data from the first page (10 entries) only. Can anyone please help me with a piece of Python code to crawl the data from the first to the last page of this website?
Website: https://www.gebiz.gov.sg/scripts/main.do?sourceLocation=openarea&select=tenderId
My Python code:
from bs4 import BeautifulSoup
import requests
import sys
import mechanize
import pprint
import re
import csv
import urllib
import urllib2

# Download the listing page, pull the text of every result-table cell,
# index the rows by their first cell and dump them to gebiz.csv.
# NOTE(review): only the single page at `url` is fetched; no pagination here.
browser = mechanize.Browser()
browser.set_handle_robots(False)  # tell mechanize not to honour robots.txt
url = 'https://www.gebiz.gov.sg/scripts/main.do?sourceLocation=openarea&select=tenderId'
response = browser.open(url)
html_doc = response.read()

rows_list = []
table_dict = {}

soup = BeautifulSoup(html_doc)
table = soup.find(
    "table",
    attrs={
        "width": "100%",
        "border": "0",
        "cellspacing": "2",
        "cellpadding": "3",
        "bgcolor": "#FFFFFF",
    },
)
if table is None:
    # soup.find() returns None when nothing matches; stop with a clear
    # message instead of an AttributeError on the next line.
    sys.exit("Could not find the results table in the downloaded page")

tr_elements = table.find_all(
    "tr", class_=re.compile(r'(row_even|row_odd|header_subone)'))

for tr_element in tr_elements:
    row_cells = []
    for td_element in tr_element.find_all("td"):
        table_elements_in_td_element = td_element.find_all("table")
        # A cell that wraps a nested table is skipped; only cells without
        # a nested table contribute their text.
        if table_elements_in_td_element:
            continue
        row_cells.append(td_element.text)
        # Debug trace kept from the original script.
        pprint.pprint(len(table_elements_in_td_element))
    rows_list.append(row_cells)
pprint.pprint(rows_list)

# Drop every row that produced no cells. list.remove([]) only removed the
# first such row and raised ValueError when there was none.
rows_list = [row for row in rows_list if row]

for row in rows_list:
    table_dict[row[0]] = {
        # 'S/No.': row[1],
        'Document No.': row[1] + row[2],
        'Tenders and Quotations': row[3] + row[4],
        'Publication Date': row[5],
        'Closing Date': row[6],
        'Status': row[7],
    }
pprint.pprint(table_dict)

# Binary mode is what the Python 2 csv module expects for its output file.
with open('gebiz.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, dialect='excel')
    # NOTE(review): keys are cell text, so this ordering is by string
    # comparison, not numeric - confirm that is the intended row order.
    for key in sorted(table_dict):
        entry = table_dict[key]
        csvwriter.writerow([
            entry['Document No.'],
            entry['Tenders and Quotations'],
            entry['Publication Date'],
            entry['Closing Date'],
            entry['Status'],
        ])
Any help would be highly appreciated.