Commit 5e603daa by Andy Wingo

Add bogus comment detector.

* tekuti/classifier.scm: New file.
* Makefile.am: Add new file.
* tekuti/comment.scm:
* tekuti/git.scm:
* tekuti/index.scm:
* tekuti/page.scm: Update to include bogus comment detector.
1 parent 42b77efe
......@@ -4,6 +4,7 @@ SOURCES = \
tekuti/base64.scm \
tekuti/boot.scm \
tekuti/cache.scm \
tekuti/classifier.scm \
tekuti/comment.scm \
tekuti/config.scm \
tekuti/filters.scm \
......
......@@ -37,7 +37,8 @@
#:use-module (tekuti match-bind)
#:export (blob->comment comment-sxml-content comment-timestamp
comment-readable-date bad-new-comment-post?
make-new-comment delete-comment))
parse-new-comment make-new-comment delete-comment
compute-legit-comments compute-bogus-comments))
(define *comment-spec*
`((timestamp . ,string->number)))
......@@ -137,31 +138,38 @@
(define de-newline (s///g "[\n\r]" " "))
(define (make-new-comment key title post-data)
(define (parse-new-comment post-data)
(let ((content (assoc-ref post-data "comment"))
(author (assoc-ref post-data "author"))
(email (assoc-ref post-data "email"))
(url (assoc-ref post-data "url")))
(let ((sha1 (with-output-to-blob
(for-each
(lambda (pair)
(format #t "~a: ~a\n" (car pair) (cdr pair)))
`((timestamp . ,(time-second (current-time)))
(author . ,(de-newline author))
(author_email . ,email)
(author_url . ,url)))
(display "\n")
(display content)))
(message (format #f "comment on \"~a\" by ~a" title author)))
(git-update-ref
"refs/heads/master"
(lambda (master)
(git-commit-tree (munge-tree1 master
'create
(list key "comments")
(list sha1 sha1 'blob))
master message #f))
5))))
`((timestamp . ,(time-second (current-time)))
(author . ,(de-newline author))
(author_email . ,email)
(author_url . ,url)
(raw-content . ,content))))
(define (make-new-comment key title comment)
(let ((sha1 (with-output-to-blob
(for-each
(match-lambda
((k . v)
(unless (eq? k 'raw-content)
(format #t "~a: ~a\n" k v))))
comment)
(display "\n")
(display (assq-ref comment 'raw-content))))
(message (format #f "comment on \"~a\" by ~a" title
(assq-ref comment 'author))))
(git-update-ref
"refs/heads/master"
(lambda (master)
(git-commit-tree (munge-tree1 master
'create
(list key "comments")
(list sha1 sha1 'blob))
master message #f))
5)))
(define (delete-comment post id)
(let ((key (post-key post))
......@@ -174,3 +182,45 @@
`(,id))
master message #f))
5)))
(define (compute-legit-comments master-ref)
;; sha1 -> #t
(define legit (make-hash-table))
(pk 'computing-legit)
(for-each
(match-lambda
((post-name post-sha1 'tree)
(for-each
(match-lambda
((comment-name comment-sha1 'blob)
(hash-set! legit comment-sha1 comment-name)))
(git-ls-tree (string-append post-sha1 ":comments") #f))))
(git-ls-tree master-ref #f))
(pk 'done legit))
(define (compute-bogus-comments master-ref legit)
;; sha1 -> #t
(define visited-trees (make-hash-table))
(define bogus (make-hash-table))
(pk 'computing-bogus)
(fold-commits
(lambda (rev commit _)
(pk 'computing-bogus rev)
(for-each
(match-lambda
((post-name post-sha1 'tree)
(unless (hash-ref visited-trees post-sha1)
(hash-set! visited-trees post-sha1 #t)
(for-each
(match-lambda
((comment-name comment-sha1 'blob)
(unless (or (hash-ref legit comment-sha1)
(hash-ref bogus comment-sha1))
(hash-set! bogus comment-sha1 comment-name)))
(_ #f))
(git-ls-tree (string-append post-sha1 ":comments") #f))))
(_ #f))
(git-ls-tree (assq-ref commit 'tree) #f)))
(assq-ref (parse-commit master-ref) 'parent)
#f)
bogus)
......@@ -27,6 +27,7 @@
(define-module (tekuti git)
#:use-module (ice-9 rdelim)
#:use-module (ice-9 popen)
#:use-module (ice-9 textual-ports)
#:use-module (tekuti util)
#:use-module (tekuti config)
#:use-module (tekuti match-bind)
......@@ -44,6 +45,7 @@
git-commit-tree git-rev-list git-revert
munge-tree munge-tree1 parse-commit commit-utc-timestamp
fold-commits
with-output-to-blob with-input-from-blob))
......@@ -77,7 +79,7 @@
(define (prepend-env args)
(if (null? env)
args
(cons "/usr/bin/env" (append env args))))
(cons "/run/current-system/profile/bin/env" (append env args))))
(define (redirect-input args)
(if input-file
(list "/bin/sh" "-c"
......@@ -86,11 +88,7 @@
args))
(let* ((real-args (trc (redirect-input (prepend-env args))))
(pipe (apply open-pipe* OPEN_READ real-args))
(output (begin
(let ((bv (get-bytevector-all pipe)))
(if (eof-object? bv)
""
(utf8->string bv)))))
(output (get-string-all pipe))
(ret (close-pipe pipe)))
(case (status:exit-val ret)
((0) output)
......@@ -335,6 +333,14 @@
"^([^ ]+) (.*)$" (_ k v)
(cons (string->symbol k) v))))))
(define (fold-commits f rev seed)
(let lp ((rev (git-rev-parse rev)) (seed seed))
(if rev
(let ((commit (parse-commit rev)))
(lp (assq-ref commit 'parent)
(f rev commit seed)))
seed)))
(define (commit-utc-timestamp commit)
(match-bind
"^(.*) ([0-9]+) ([+-][0-9]+)" (assq-ref (parse-commit commit) 'committer)
......
......@@ -33,46 +33,48 @@
#:use-module (tekuti post)
#:use-module (tekuti tags)
#:use-module (tekuti cache)
#:use-module (tekuti classifier)
#:export (maybe-reindex read-index update-index))
;; Additionally an index has an "index" field, indicating the commit
;; that it was saved in, and a "master" field, indicating the commit
;; that it indexes.
(define index-specs
`((posts ,reindex-posts ,write-hash ,read-hash)
`((master #f ,write ,read)
(posts ,reindex-posts ,write-hash ,read-hash)
(posts-by-date ,reindex-posts-by-date ,write ,read)
(tags ,reindex-tags ,write-hash ,read-hash)
(cache ,(lambda _ (make-empty-cache)) ,(lambda (x) #f) ,(lambda () '()))))
(legit-comments ,reindex-legit-comments ,write-hash ,read-hash)
(bogus-comments ,reindex-bogus-comments ,write-hash ,read-hash)
(classifier ,reindex-classifier #f #f)
(cache ,(lambda _ (make-empty-cache)) #f #f)))
(define (reindex oldindex master)
;; Leave off "index" field.
(with-time-debugging
(fold (lambda (pair index)
(acons (car pair) ((cadr pair) oldindex index)
index))
(fold (lambda (spec index)
(match spec
((key reindex write read)
(acons key (with-time-debugging (begin (pk key) (reindex oldindex index))) index))))
(acons 'master master '())
index-specs)))
(define (assoc-list-ref alist key n default)
(let ((l (assoc key alist)))
(if l (list-ref l n) default)))
(define (index->blob key value)
(with-output-to-blob
((assoc-list-ref index-specs key 2 write) value)))
(define (blob->index name sha1)
(with-input-from-blob
sha1
((assoc-list-ref index-specs (string->symbol name) 3 read))))
;; Skip past "master" as we handle that one specially.
(match index-specs
((('master . _) . specs) specs)))))
(define (write-index index oldref)
(let ((new (git-commit-tree
(git-mktree
(let lp ((index index))
(cond
((null? index) '())
((eq? (caar index) 'index) (lp (cdr index)))
(else (cons (list (caar index)
(index->blob (caar index) (cdar index))
'blob)
(lp (cdr index)))))))
(match index
(() '())
(((k . v) . index)
(match (assq k index-specs)
((_ reindex write read)
(if write
(cons (list k (with-output-to-blob (write v)) 'blob)
(lp index))
(lp index)))
(_ (lp index)))))))
oldref "reindex\n"
(commit-utc-timestamp (assq-ref index 'master)))))
(or (false-if-git-error
......@@ -81,19 +83,26 @@
new))
(define (read-index)
(pk 'reading-index)
(match (false-if-git-error (git-rev-parse "refs/heads/index"))
(#f (maybe-reindex '()))
(ref
(let ((dents (git-ls-tree ref #f)))
(if (and-map (lambda (spec)
(assoc (symbol->string (car spec)) dents))
index-specs)
(acons 'index ref
(map (lambda (dent)
(cons (string->symbol (car dent))
(blob->index (car dent) (cadr dent))))
dents))
(maybe-reindex (acons 'index ref '())))))))
(fold (lambda (spec index)
(match spec
((key reindex write read)
(pk 'read-index-key key)
(acons key
(cond
((and read (assoc (symbol->string key) dents))
=> (match-lambda
((_ sha1 'blob)
(with-input-from-blob sha1 (read)))))
(else
(reindex '() index)))
index))))
`((index . ,ref))
index-specs)))))
(define (maybe-reindex old-index)
(let ((master (git-rev-parse "refs/heads/master")))
......
......@@ -30,6 +30,7 @@
#:use-module (tekuti git)
#:use-module (tekuti post)
#:use-module (tekuti comment)
#:use-module (tekuti classifier)
#:use-module (web uri)
#:use-module (web request)
#:use-module (tekuti request)
......@@ -208,13 +209,27 @@
(cond
((post-from-key index (make-post-key year month day name))
=> (lambda (post)
(cond
((bad-new-comment-post? data)
=> (lambda (reason)
(respond `((p "Bad post data: " ,(pk reason))))))
(else
(let ((comment (make-new-comment (post-key post) (post-title post)
data)))
(let ((comment (parse-new-comment data)))
(cond
((bad-new-comment-post? data)
=> (lambda (reason)
(respond `((p "Bad post data: " ,(pk reason))))))
((comment-is-bogus? index comment)
(respond `((p "Comment appears to be bogus; ignoring.")
(p "I'm testing out a new automated bogus "
"comment detector. If you feel your comment "
"was caught unfairly, tweet it to me or send "
"it by email. Or press back and reword it.")
(p "If you are a spammer, note that I fixed "
"the comment renderer to properly add "
(tt "rel='external nofollow'") " on all "
"links in comments. Go take a look at any "
"comment with a link to see for yourself. "
"Trying to linkbomb this site probably won't "
"give you any link juice so it's not worth "
"the trouble to either one of us :)"))))
(else
(make-new-comment (post-key post) (post-title post) comment)
;; nb: at this point, `post' is out-of-date
(respond `((p "Comment posted, thanks."))
#:redirect (post-url post #:fragment "comments")
......
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!