import requests, re
from bs4 import BeautifulSoup

# Search-results URL shared by the initial request and every paginated one.
SEARCH_URL = 'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15'
HEADERS = {'user-agent': 'some agent'}
PAGE_SIZE = 25  # step used for the `offset` query parameter


def _text_or_none(node):
    """Return the text of a matched element, or None when nothing matched."""
    return node.text if node is not None else None


# One record per property card, so names and reviews stay paired and can be
# loaded straight into a two-column table (e.g. pandas.DataFrame(data)).
data = []

# The first request is only used to read the total result count, taken from
# the element that directly precedes the pagination widget.
soup = BeautifulSoup(
    requests.get(SEARCH_URL, headers=HEADERS).text,
    'html.parser',
)
num_results = int(
    re.search(
        r'\d+',
        soup.select_one('div:has(+[data-testid="pagination"])').text,
    ).group(0)
)

# Stepping by PAGE_SIZE up to num_results also visits the final, partially
# filled page, which int(num_results / 25) would have skipped.
for offset in range(0, num_results, PAGE_SIZE):
    soup = BeautifulSoup(
        requests.get(f'{SEARCH_URL}&offset={offset}', headers=HEADERS).text,
        'html.parser',
    )
    for card in soup.select('[data-testid="property-card"]'):
        data.append({
            'name': _text_or_none(
                card.select_one('[data-testid="title"]')
            ),
            # Store the review text rather than the Tag object; a card
            # without a review element yields None.
            'review': _text_or_none(
                card.select_one('[class="d8eab2cf7f c90c0a70d3 db63693c62"]')
            ),
        })

data
I am getting the names and reviews for all pages in a single line, but I want the result split into separate columns for names and reviews.
I want to get my result like this:
I couldn't quite understand your question or what you want. It would help if you could show a sample of the DataFrame you expect. In general, though, you can do it as shown below. For example, in this data the latitude and longitude are in the same column, and you can separate them into two columns with the split function. Don't forget to add headers.
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime

# Site root; prepended to the property links found on the results page.
base_url = 'https://www.booking.com'

# Search-results page whose property links are followed by general().
urlss = 'https://www.booking.com/searchresults.html?req_children=0&label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&group_children=0&dest_type=city&rows=15&aid=304142&dest_id=-2092174&nflt=ht_id%3D204&req_adults=2&no_rooms=1&group_adults=2'

# Module-level accumulator: pars() appends one dict per scraped property.
data = []
def pars(url):
    """Scrape one property page and append its record to the global ``data``.

    The record holds the description, the title, the map coordinates (when
    the static-map element is present) and one ``feature<N>`` entry per
    feature element found on the page.

    Args:
        url: Absolute URL of the property page to download.

    Pages missing the description or title element are skipped, so nothing
    is appended for them.
    """
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    record = {}
    try:
        # find() returns None when the element is absent, so .text raises
        # AttributeError and the page is skipped as a whole.
        record['description'] = soup.find(
            'div', id='property_description_content'
        ).text
        record['Title'] = soup.find(
            'h2', class_='d2fee87262 pp-header__title'
        ).text
        features = soup.find_all('div', class_='a815ec762e ab06168e66')
        div_map = soup.select_one('#hotel_sidebar_static_map')
        if div_map:
            record['x_lnge'] = div_map['data-atlas-latlng']
        for index, feature in enumerate(features):
            record[f'feature{index}'] = feature.text
    except (AttributeError, KeyError) as exc:
        # Best-effort scraping: report and skip pages with an unexpected
        # layout, without hiding unrelated errors as a bare except would.
        print(f'skipping {url}: {exc!r}')
    else:
        data.append(record)
def general():
    """Download the search-results page and scrape every linked property.

    Every ``header > a`` link on the page at ``urlss`` is resolved against
    ``base_url``, printed, and handed to ``pars``.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    response = requests.get(urlss)
    soup = bs(response.text, 'html.parser')
    for link in soup.select('header > a'):
        href = link.get('href')
        if not href:
            # An anchor without a target cannot be followed.
            continue
        # urljoin gives the same result as concatenation for root-relative
        # hrefs and also copes with absolute or slash-less ones.
        urls = urljoin(base_url, href)
        print(urls)
        pars(urls)
def export_data(data):
    """Write the scraped records to a dated Excel workbook.

    Args:
        data: Records to export, in any form ``pandas.DataFrame`` accepts
            (here, the list of dicts built by the scraper).

    Duplicate rows are dropped. The file is written to the current working
    directory as ``<YYYY_MM_DD>booking.xlsx`` without the index column.
    """
    # pandas is not imported at the top of the visible file, so import it
    # here to avoid a NameError on `pd`.
    import pandas as pd

    frame = pd.DataFrame(data).drop_duplicates()
    # Same stamp as str(datetime.now())[0:10].replace('-', '_').
    stamp = datetime.now().strftime('%Y_%m_%d')
    frame.to_excel(f'{stamp}booking.xlsx', index=False)


if __name__ == '__main__':
    general()
    export_data(data)