I am scraping this website and get "title" and "category" as text using .get_text().strip()
.
I have a problem using the same approach for extracting the "author" as text.
data2 = {'url' : [],'title' : [],'category': [],'author': [],
} url_pattern = "https://www.nature.com/nature/articles?searchType=journalSearch&sort=PubDate&year=2018&page={}"
count_min = 1
count_max = 3while count_min <= count_max: print (count_min)url = url_pattern.format(count_min)r = requests.get(url)try: soup = BeautifulSoup(r.content, 'lxml')for links in soup.find_all('article'):data2['url'].append(links.a.attrs['href']) data2['title'].append(links.h3.get_text().strip())data2["category"].append(links.span.get_text().strip()) data2["author"].append(links.find('span', {"itemprop": "name"}).get_text().strip()) #??????except Exception as exc:print(exc.__class__.__name__, exc)time.sleep(0.1)count_min = count_min + 1print ("Fertig.")
df = pd.DataFrame( data2 )
df
df
is supposed to print a table with "author", "category", "title", "url". The print Exception gives me the following hint: AttributeError 'NoneType' object has no attribute 'get_text'
. But instead of the table I get the following message.
ValueError Traceback (most recent call last)
<ipython-input-34-9bfb92af1135> in <module>()29 30 print ("Fertig.")
---> 31 df = pd.DataFrame( data2 )32 df~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)328 dtype=dtype, copy=copy)329 elif isinstance(data, dict):
--> 330 mgr = self._init_dict(data, index, columns, dtype=dtype)331 elif isinstance(data, ma.MaskedArray):332 import numpy.ma.mrecords as mrecords~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _init_dict(self, data, index, columns, dtype)459 arrays = [data[k] for k in keys]460
--> 461 return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)462 463 def _init_ndarray(self, values, index, columns, dtype=None, copy=False):~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)6161 # figure out the index, if necessary6162 if index is None:
-> 6163 index = extract_index(arrays)6164 else:6165 index = _ensure_index(index)~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in extract_index(data)6209 lengths = list(set(raw_lengths))6210 if len(lengths) > 1:
-> 6211 raise ValueError('arrays must all be same length')6212 6213 if have_dicts:ValueError: arrays must all be same length
How can I improve my code to get the "author" names extracted?