Commit 69603324 authored by NourEddineX

added main spider

parent 36b1dc03
import scrapy
import csv
import unicodedata
from scrapy.selector import Selector
class IMDBSpider(scrapy.Spider):
    """Scrape IMDb title pages and append one CSV row per scraped title.

    Title ids are read, one per line, from ``ids_file``. Each id becomes a
    request to ``https://www.imdb.com/title/<id>``. Pages whose overview
    widget contains a link with 'TV' in its text are skipped; every other
    page appends one row to ``output_file``.
    """

    name = "imdb"
    # Input and output paths, resolved against the current working directory.
    ids_file = 'ids.txt'
    output_file = 'imdb.csv'

    def joinvars(self, var):
        """Return the strings in ``var`` joined with ' - '."""
        return ' - '.join(var)

    def start_requests(self):
        """Yield one request per non-blank line of ``ids_file``."""
        # Read everything up front so the file is closed before the first
        # request is yielded (a generator would otherwise keep it open).
        with open(self.ids_file, 'r') as ids_handle:
            title_ids = [line.strip() for line in ids_handle]
        for title_id in title_ids:
            if not title_id:
                # A blank line would produce a request for ".../title/".
                continue
            url = 'https://www.imdb.com/title/%s' % (title_id,)
            yield scrapy.Request(url=url, callback=self.parse)

    @staticmethod
    def _pick(values, index=0):
        """Return ``values[index]``, or '' when the list is too short."""
        try:
            return values[index]
        except IndexError:
            return ''

    def parse(self, response):
        """Extract the title's fields from ``response`` and append a CSV row.

        Row columns, in order: title, year, director, writers, stars,
        keywords, rating, country, production companies, tagline, release
        date, budget, runtime, sound mix, summary. Multi-valued fields are
        joined with ' - '; fields missing from the page are written as ''.
        """

        def texts(query):
            # .extract() returns a (possibly empty) list and does not raise
            # when nothing matches, so only the indexing needs guarding.
            return response.xpath(query).extract()

        tvmovie = texts('//*[@id="title-overview-widget"]//*/div/a[contains(text(),\'TV\')]//text()')
        if tvmovie:
            # TV titles are deliberately not written to the CSV.
            return

        # NFKD normalisation replaces compatibility characters (for example
        # non-breaking spaces) in the heading text with plain equivalents.
        title = unicodedata.normalize(
            "NFKD",
            self._pick(texts('//*[@id="title-overview-widget"]//*/div/h1//text()')),
        )
        year = self._pick(texts('//*[@id="titleYear"]/a//text()'))

        # Multi-valued fields: kept as lists and joined when the row is built.
        director = texts('//*[@id="title-overview-widget"]//*/div/span[@itemprop="director"]/a/span//text()')
        writers = texts('//*[@id="title-overview-widget"]//*/div/span[@itemprop="creator"]/a/span//text()')
        stars = texts('//*[@id="title-overview-widget"]//*/div/span[@itemprop="actors"]/a/span//text()')
        keywords = texts('//*[@id="titleStoryLine"]/div/a/span[@itemprop="keywords"]//text()')
        rating = texts('//*[@id="title-overview-widget"]//*/div/strong/span[@itemprop="ratingValue"]//text()')
        country = texts('//*[@id="titleDetails"]/div/h4[contains(text(),"Country")]/../a/text()')
        productionco = texts('//*[@id="titleDetails"]/div/h4[contains(text(),"Production Co")]/../span/a/span/text()')
        soundmix = texts('//*[@id="titleDetails"]//*/h4[contains(text(),"Sound Mix")]/../a//text()')

        # NOTE(review): genre and language are not part of the row. Append
        # them at the end of the row if downstream consumers need them, so
        # the existing 15 columns keep their positions.

        # For these blocks the value is the third text node under the parent
        # of the <h4> label, hence index 2.
        tagline = self._pick(
            texts('//*[@id="titleStoryLine"]//*/h4[contains(text(),"Taglines")]/..//text()'), 2
        ).strip()
        releasedate = self._pick(
            texts('//*[@id="titleDetails"]//*/h4[contains(text(),"Release Date")]/..//text()'), 2
        ).strip()
        budget = self._pick(
            texts('//*[@id="titleDetails"]//*/h4[contains(text(),"Budget")]/..//text()'), 2
        ).strip()
        runtime = self._pick(
            texts('//*[@id="titleDetails"]//*/h4[contains(text(),"Runtime")]/../time//text()')
        )
        summary = self._pick(response.css('div.summary_text::text').extract()).strip()

        row = [
            title,
            year,
            self.joinvars(director),
            self.joinvars(writers),
            self.joinvars(stars),
            self.joinvars(keywords),
            self.joinvars(rating),
            self.joinvars(country),
            self.joinvars(productionco),
            tagline,
            releasedate,
            budget,
            runtime,
            self.joinvars(soundmix),
            summary,
        ]
        # newline='' lets the csv writer control line endings (otherwise
        # blank rows appear on Windows); an explicit encoding keeps
        # non-ASCII titles from failing under a non-UTF-8 locale.
        with open(self.output_file, 'a', newline='', encoding='utf-8') as csvf:
            csv.writer(csvf).writerow(row)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment