Commit 282753ab authored by Michael Rose

hpff metadata parser

parent 66186a31
# ActiveRecord model for one scraped story record. It declares no behavior of
# its own; attributes such as hpff_id, title, author, author_id, pre_data,
# post_data and summary are assigned by IndexHpffMetadata#parse_row.
class HpffStory < ApplicationRecord
end
require 'pp'
# Sidekiq worker that downloads one page of the harrypotterfanfiction.com
# story search listing, saves every story row found on it as an HpffStory,
# and then enqueues itself for the next page until LAST_PAGE is reached.
class IndexHpffMetadata
  include Sidekiq::Worker
  sidekiq_options :backtrace => true

  # Listing URL without the trailing page number.
  SEARCH_URL = 'https://www.harrypotterfanfiction.com/storysearch.php?&srt=5&sctlab='.freeze

  # Last listing page to index; the worker stops re-enqueueing itself here.
  LAST_PAGE = 3338

  # Fetches, parses and stores a single listing page, then schedules the
  # following one.
  #
  # page - Integer page number of the listing to pull.
  #
  # Raises RuntimeError when the site answers with a non-success status, and
  # whatever HpffStory#save! raises when a record cannot be stored, so the
  # job fails visibly instead of silently skipping a page.
  def perform(page)
    prev_url = "#{SEARCH_URL}#{page - 1}"
    url = "#{SEARCH_URL}#{page}"
    logger.info "HPFF: Pulling #{url}..."

    # Browser-like headers; the previous listing page is sent as the Referer.
    response = Faraday.get(url) do |req|
      req.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
      # req.headers['Accept-Encoding'] = 'gzip, deflate, br'
      req.headers['Accept-Language'] = 'en-US,en;q=0.9'
      req.headers['Referer'] = prev_url
      req.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
    end

    # An error page would otherwise parse to zero stories and the crawl would
    # move on as if the page had been indexed.
    unless response.success?
      raise "HPFF: unexpected HTTP status #{response.status} for #{url}"
    end

    document = Nokogiri::HTML.parse(response.body)
    stories = parse_stories(document)

    # save! (not save) so that a failed record aborts and rolls back the
    # whole page instead of being dropped without notice.
    HpffStory.transaction do
      stories.each(&:save!)
    end

    # `<` rather than `!=` so a job started past the last page cannot
    # re-enqueue itself forever.
    IndexHpffMetadata.perform_async(page + 1) if page < LAST_PAGE
  end

  # Commented-out Kotlin version of the row parser, apparently kept as a
  # reference for parse_row below.
  #
  # private val TITLE_REGEX = Regex("psid=(\\d+)")
  # private val AUTHOR_REGEX = Regex("showuid=(\\d+)")
  #
  # fun parseStory(e: Element): Story {
  # val selectFirst = e.selectFirst("p")
  # val storyLink = selectFirst.selectFirst("a")
  # val titleMatches = TITLE_REGEX.find(storyLink.attr("href"))
  # val storyId = titleMatches!!.groupValues[1].toInt()
  #
  # val authorLink = selectFirst.select("a")[1]
  # println(authorLink)
  # val authorMatches = AUTHOR_REGEX.find(authorLink.attr("href"))
  # val authorId = authorMatches!!.groupValues[1].toInt()
  #
  # val preStoryData = selectFirst.selectFirst("span").text()
  # val summary = e.child(0).ownText()
  # val postStoryData = e.child(0).select("span").last().text()
  #
  # return Story(
  # id = storyId,
  # title = storyLink.text(),
  # author = Author(id = authorId, name = authorLink.text()),
  # summary = summary,
  # preStoryData = preStoryData,
  # postStoryData = postStoryData
  # )

  # Builds an unsaved HpffStory for every table row of the document that
  # looks like a story entry.
  #
  # document - parsed Nokogiri HTML document of a listing page.
  #
  # Returns an Array of HpffStory; rows parse_row cannot interpret are left out.
  def parse_stories(document)
    story_rows = document.css('body table tr')
    story_rows.map { |row| parse_row(row) }.compact
  end

  # Extracts the story fields from one table row.
  #
  # e - Nokogiri node for a <tr> of the listing table.
  #
  # Returns an unsaved HpffStory, or nil when the row lacks the first <p>,
  # the story/author links, or the numeric ids in their hrefs.
  def parse_row(e)
    first_p = e.css('p').first
    return nil if first_p.nil?

    # The first link in the paragraph is the story, the second the author.
    story_link, author_link = first_p.css('a').to_a
    return nil if story_link.nil? || author_link.nil?

    story_id = story_link['href'].to_s[/psid=(\d+)/, 1]
    author_id = author_link['href'].to_s[/showuid=(\d+)/, 1]
    return nil if story_id.nil? || author_id.nil?

    pre_span = first_p.css('span').first
    post_span = e.css('span').last

    # First *element* child, as in the Kotlin reference's e.child(0);
    # children[0] may be a whitespace text node. It is never nil here
    # because the row is known to contain a <p>.
    container = e.first_element_child

    HpffStory.new(
      hpff_id: story_id.to_i,
      title: story_link.text,
      author_id: author_id.to_i,
      author: author_link.text,
      pre_data: pre_span.nil? ? nil : pre_span.text,
      # Stored as markup, not plain text, exactly as before.
      post_data: post_span.nil? ? nil : post_span.children.to_xml,
      # Plain text of the container's own text nodes instead of a raw NodeSet.
      summary: container.xpath('text()').text.strip
    )
  end
end
# Creates the hpff_stories table that backs the HpffStory model.
class CreateHpffStories < ActiveRecord::Migration[5.1]
  def change
    create_table :hpff_stories do |t|
      # Identifier taken from the story link's psid= parameter.
      t.integer :hpff_id
      t.string :title
      t.string :author
      # Identifier taken from the author link's showuid= parameter.
      t.integer :author_id
      t.string :pre_data
      # post_data holds serialized markup and summary holds free-form text;
      # both can exceed the 255-character limit that :string has on some
      # adapters (e.g. MySQL), so they are stored as :text.
      t.text :post_data
      t.text :summary
      t.timestamps
    end
  end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment