Commit 2848d971 authored by Michael Rose

hpff: chapter metadata indexer

parent 65da60b8
# ActiveRecord model for a single chapter record; it defines no behaviour of
# its own beyond what ApplicationRecord provides.
class HpffChapter < ApplicationRecord; end
require 'pp'
# Sidekiq worker that fetches one story's page from harrypotterfanfiction.com,
# parses the chapter table on that page into HpffChapter records, saves them,
# and then enqueues itself for the next story id.
#
# NOTE(review): re-running the job for a story that was already indexed will
# insert its chapters a second time; nothing here de-duplicates rows.
class IndexHpffChapterMetadata
  include Sidekiq::Worker
  sidekiq_options :backtrace => true

  # Sent as the Referer header on every story request.
  SEARCH_URL = "https://www.harrypotterfanfiction.com/storysearch.php?&srt=5&sctlab=1"

  # When the response body contains this text the story is flagged MISSING.
  MISSING_STORY_MARKER = 'ERROR locating story meta '

  # The self-enqueueing walk stops once this story id has been processed.
  LAST_STORY_ID = 83478

  # Extracts the numeric chapter id from a chapter link's href.
  CHAPTER_ID_PATTERN = /chapterid=(\d+)/

  # Indexes the chapters of the story with the given database id, then
  # enqueues the job for +story_id + 1+ until LAST_STORY_ID is reached.
  #
  # A story id with no matching HpffStory row is logged and skipped so that a
  # gap in the id sequence does not stop the walk. Any other failure raises,
  # which leaves the retry to Sidekiq and does not enqueue the next id.
  #
  # story_id - Integer primary key of the HpffStory to index.
  def perform(story_id)
    story = HpffStory.find_by(id: story_id)
    if story.nil?
      logger.warn "HPFF: No story with id #{story_id}; skipping"
    else
      index_story(story)
    end

    # `<` rather than `!=` so a walk started past the last id terminates.
    IndexHpffChapterMetadata.perform_async(story_id + 1) if story_id < LAST_STORY_ID
  end

  # Commented-out Kotlin story parser, apparently left here as a reference.
  # TODO(review): confirm whether it is still needed; it is not executed.
  #
  # private val TITLE_REGEX = Regex("psid=(\\d+)")
  # private val AUTHOR_REGEX = Regex("showuid=(\\d+)")
  #
  # fun parseStory(e: Element): Story {
  # val selectFirst = e.selectFirst("p")
  # val storyLink = selectFirst.selectFirst("a")
  # val titleMatches = TITLE_REGEX.find(storyLink.attr("href"))
  # val storyId = titleMatches!!.groupValues[1].toInt()
  #
  # val authorLink = selectFirst.select("a")[1]
  # println(authorLink)
  # val authorMatches = AUTHOR_REGEX.find(authorLink.attr("href"))
  # val authorId = authorMatches!!.groupValues[1].toInt()
  #
  # val preStoryData = selectFirst.selectFirst("span").text()
  # val summary = e.child(0).ownText()
  # val postStoryData = e.child(0).select("span").last().text()
  #
  # return Story(
  # id = storyId,
  # title = storyLink.text(),
  # author = Author(id = authorId, name = authorLink.text()),
  # summary = summary,
  # preStoryData = preStoryData,
  # postStoryData = postStoryData
  # )

  # Builds one unsaved HpffChapter per chapter row of the story page.
  #
  # The chapter table is taken to be the third <table> under <body>, and its
  # first row is dropped (presumably a header row - TODO confirm).
  #
  # story_id - value stored in each chapter's story_id column (the caller
  #            passes the story's hpff_id, not its database id).
  # document - parsed Nokogiri document of the story page.
  #
  # Returns an Array of unsaved HpffChapter records.
  # Raises RuntimeError when the page has no third table.
  def parse_chapters(story_id, document)
    chapter_table = document.css('body table')[2]
    raise "HPFF: chapter table not found for story #{story_id}" if chapter_table.nil?

    chapter_table.css('tr').drop(1).map { |row| parse_row(story_id, row) }
  end

  # Builds an unsaved HpffChapter from a single table row.
  #
  # story_id - value stored in the chapter's story_id column.
  # e        - Nokogiri node for the <tr> describing the chapter.
  #
  # Returns an unsaved HpffChapter.
  # Raises RuntimeError when the row has no link carrying a chapterid.
  def parse_row(story_id, e)
    chapter_links = e.css('b a')
    first_link = chapter_links.first
    id_match = first_link && CHAPTER_ID_PATTERN.match(first_link['href'].to_s)
    raise "HPFF: no chapter link found in row for story #{story_id}" if id_match.nil?

    HpffChapter.new(
      story_id: story_id,
      chapter_id: id_match[1].to_i,
      chapter_name: chapter_links.text,
      chapter_blurb: e.css('p.blurb').children.to_xhtml,
      # Strip thousands separators first: "1,234".to_i would otherwise be 1.
      wordcount: e.css('td')[2].text.delete(',').to_i
    )
  end

  private

  # Fetches and parses the story page, then saves every chapter in a single
  # transaction. Flags the story as MISSING, and parses nothing, when the
  # response body contains MISSING_STORY_MARKER.
  def index_story(story)
    response = fetch_story_page(story)

    # Checked before the HTTP status in case the marker page is served with
    # a non-2xx status.
    if response.body.include? MISSING_STORY_MARKER
      story.fetch_status = 'MISSING'
      story.save!
      return
    end

    unless response.success?
      raise "HPFF: story #{story.hpff_id} request failed with HTTP #{response.status}"
    end

    # The body is reinterpreted as ISO-8859-1 and transcoded to UTF-8 before parsing.
    document = Nokogiri::HTML.parse(response.body.force_encoding('iso-8859-1').encode('utf-8'))
    chapters = parse_chapters(story.hpff_id, document)

    HpffChapter.transaction do
      # save! raises on failure so the whole batch is rolled back together.
      chapters.each(&:save!)
    end
  end

  # Performs the GET for the story's page with browser-like request headers
  # and returns the Faraday response.
  def fetch_story_page(story)
    url = "https://www.harrypotterfanfiction.com/viewstory.php?psid=#{story.hpff_id}"
    logger.info "HPFF: Pulling chapter #{url}..."

    Faraday.get(url) do |req|
      req.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
      # req.headers['Accept-Encoding'] = 'gzip, deflate, br'
      req.headers['Accept-Language'] = 'en-US,en;q=0.9'
      req.headers['Referer'] = SEARCH_URL
      req.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
    end
  end
end
# Creates the hpff_chapters table, one row per chapter.
class CreateHpffChapters < ActiveRecord::Migration[5.1]
  def change
    create_table :hpff_chapters do |t|
      t.integer :story_id
      t.integer :chapter_id
      t.string :chapter_name
      # :text rather than :string: the blurb is stored as serialized XHTML of
      # unbounded length, and :string is limited to 255 characters on some
      # database adapters (e.g. MySQL).
      t.text :chapter_blurb
      t.integer :wordcount
      t.timestamps
    end
  end
end
# Adds a string fetch_status column to the hpff_stories table.
class AddStatusToHpffStory < ActiveRecord::Migration[5.1]
  def change
    change_table :hpff_stories do |stories|
      stories.string :fetch_status
    end
  end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment