Commit cae012f9 authored by David Spencer

Added parsers/

parent 65934afb
SlackBuilds.org related stuff
* review -- Review, approve and push SlackBuilds.org submissions
* review/ -- Review, approve and push SlackBuilds.org submissions
* changelog-parser -- Parse a Slackware-style ChangeLog.txt and output it
as JSON
* parsers/ -- Python modules for parsing Slackware and SBo file formats:
ChangeLog.txt files, SBo .info files, and Slackware package names
* hooks -- git hooks for SlackBuilds.org git committers
* hooks/ -- git hooks for SlackBuilds.org git committers
#!/usr/bin/python3
"""
parse_changelog.py
Parse Slackware (and friends) ChangeLog.txt.
Requires python3, and ply (built with python3 support).
dbs 2018-09-08 Unlicense http://unlicense.org/
Requires: python3, and ply (built with python3 support)
This proof-of-concept reads a ChangeLog on standard input and creates a
Python list of dicts. Each dict is a changelog entry comprising "date"
(as a string), "motd" (the entry's optional message of the day, or a
null string), and "itemlist" (a list of dicts comprising "itemname",
"description" and "securityfix"). This is then written to standard
output as JSON.
[
{
date: "string",
motd: "string",
itemlist:
[
{ itemname:"string", description:"string", securityfix:BOOL },
...
]
},
...
]
Whitespace and newlines are preserved in "motd" and "description",
except that the description's required one- or two-space indentation is
removed.
Provides:
parse_changelog(clpath)
(returns parsed contents of 'clpath' as a list of dicts)
"""
#-----------------------------------------------------------------------
import sys
import json
#-------------------------------------------------------------------------------
#
# Implementation Note
#
# We generate a list of dicts to hold the parsed changelog.
#
# Each dict in the list represents a changelog entry, as follows:
# "date"
# string (in the changelog's original format),
# "motd"
# the entry's optional message of the day, or a null string,
# "itemlist"
# a list of dicts, as follows:
# { "itemname"
# often a package name, but also 'isolinux/initrd.img' etc.
# "description"
# typically something like "Added...", "Updated..." etc,
# "securityfix"
# True or False
# }
#
# Whitespace and newlines are preserved in "motd" and "description",
# except that the description's required one- or two-space indentation is
# removed.
#
#-------------------------------------------------------------------------------
import sys
import logging
from ply import *
#-----------------------------------------------------------------------
#-------------------------------------------------------------------------------
states = (
('header', 'exclusive'),
......@@ -122,17 +128,17 @@ def t_itemdesc_DESCRIPTIONCHUNK(t):
# others
def t_ANY_error(t):
    # Lexer error handler shared by every state: report the character that
    # no token rule matched, then skip it so that lexing can continue.
    # The report goes through logging only; writing it to stdout as well
    # would duplicate the message and mix diagnostics into the program's
    # normal output.
    logging.error("We are lost at '%s'", t.value[0])
    t.lexer.skip(1)
def t_ANY_eof(t):
    # End-of-input handler shared by every lexer state.  Returning None
    # supplies no further token, so lexing simply stops here.
    return None
#-----------------------------------------------------------------------
#-------------------------------------------------------------------------------
# Build the module-level lexer with ply.lex.
lexer = lex.lex()
#-----------------------------------------------------------------------
#-------------------------------------------------------------------------------
def p_entrylist(p):
......@@ -232,14 +238,21 @@ def p_descseq(p):
def p_error(p):
    # Parser error handler.  'p' is the token at which parsing failed, or
    # None when the input ended before the grammar was satisfied.
    # The report goes through logging only (no duplicate copy on stdout)
    # and names the offending token and its line so the problem can be
    # located in the input.
    if p is None:
        logging.error("Syntax error in input! Unexpected end of input")
    else:
        logging.error(
            "Syntax error in input! Unexpected %s %r at line %s",
            p.type, p.value, p.lineno,
        )
#-----------------------------------------------------------------------
#-------------------------------------------------------------------------------
# Build the module-level parser with ply.yacc.
parser = yacc.yacc()
#-----------------------------------------------------------------------
#-------------------------------------------------------------------------------
# Command-line use: read a ChangeLog on standard input and write the parsed
# result to standard output as JSON.  This is guarded so that merely
# importing the module (to call parse_changelog) neither blocks on stdin
# nor prints anything.
if __name__ == "__main__":
    changelog = parser.parse(sys.stdin.read())
    print(json.dumps(changelog, sort_keys=True, indent=4))
def parse_changelog(clpath):
    """
    Parse the contents of a ChangeLog.txt file.

    Argument:
      clpath -- the path to the file
    Returns: a list of dicts, as described in the "Implementation Note"
      (or whatever parser.parse() yields if parsing is abandoned after a
      syntax error -- with ply that is normally None).
    Raises: OSError if the file cannot be opened or read.
    """
    with open(clpath) as changelogtxt:
        cltext = changelogtxt.read()
    # The lexer is a module-level object shared by every call.  Feeding it
    # new input does not reset its start state or line counter, so put it
    # back to a clean slate; otherwise a previous parse that stopped part
    # way through would corrupt this one.
    lexer.begin('INITIAL')
    lexer.lineno = 1
    # Pass the lexer explicitly rather than relying on ply's "most recently
    # built lexer" default.
    return parser.parse(cltext, lexer=lexer)
"""
parse_info.py
Parse a SlackBuilds.org info file.
Requires python3, and ply (built with python3 support).
dbs 2018-09-08 Unlicense http://unlicense.org/
Provides:
parse_info(infopath)
(returns parsed contents of 'infopath' as a dict)
"""
#-------------------------------------------------------------------------------
#
# Implementation Note
#
# We generate a dict of the .info file's keys and values.
#
# The format of the dict is identical to the .info file's keys and
# values, except that DOWNLOAD*, MD5SUM* and REQUIRES are always lists.
# {
# PRGNAM: "",
# VERSION: "",
# HOMEPAGE: "",
# DOWNLOAD: [ "url"... ],
# MD5SUM: [ "md5sum"... ],
# DOWNLOAD_x86_64: [ "url"... ],
# MD5SUM_x86_64: [ "md5sum"... ],
# REQUIRES: [ "dep"... ],
# MAINTAINER: "",
# EMAIL: ""
# }
#
#-------------------------------------------------------------------------------
import sys
import logging
from ply import *
#-------------------------------------------------------------------------------
# Extra lexer states (both exclusive) on top of ply's built-in INITIAL:
#   startmulti -- between a list-valued key and its opening quote
#   multi      -- inside the quotes of a list-valued key
states = (
    ('startmulti', 'exclusive'),
    ('multi', 'exclusive'),
)

# Every token name the lexer can hand to the parser.
tokens = (
    # one keyword token per .info key
    'PRGNAM',
    'VERSION',
    'HOMEPAGE',
    'DOWNLOAD',
    'MD5SUM',
    'DOWNLOAD_x86_64',
    'MD5SUM_x86_64',
    'REQUIRES',
    'MAINTAINER',
    'EMAIL',
) + (
    # punctuation and layout
    'EQUALS',
    'NEWLINE',
    'BACKSLASH',
    'DQUOTE',
    'BLANKSEQ',
) + (
    # values
    'MD5VALUE',
    'STRING',
)
# The order of these token regexes is *very* important, which is why they are
# defined as functions instead of strings (see the ply documentation).
# INITIAL state
def t_INITIAL_PRGNAM(t):
    # INITIAL state: the literal key name PRGNAM.  The string below is the
    # token's regex, so it must remain the function's first statement.
    r"PRGNAM"
    return t
def t_INITIAL_VERSION(t):
    # INITIAL state: the literal key name VERSION.
    r"VERSION"
    return t
def t_INITIAL_HOMEPAGE(t):
    # INITIAL state: the literal key name HOMEPAGE.
    r"HOMEPAGE"
    return t
def t_INITIAL_DOWNLOAD_x86_64(t):
    # INITIAL state: the literal key name DOWNLOAD_x86_64.
    # this must precede t_DOWNLOAD (otherwise the shorter keyword would
    # match first and leave "_x86_64" behind)
    r"DOWNLOAD_x86_64"
    # the value is a whitespace-separated list, so expect '="' next
    t.lexer.begin('startmulti')
    return t
def t_INITIAL_DOWNLOAD(t):
    # INITIAL state: the literal key name DOWNLOAD.
    r"DOWNLOAD"
    # the value is a whitespace-separated list, so expect '="' next
    t.lexer.begin('startmulti')
    return t
def t_INITIAL_MD5SUM_x86_64(t):
    # INITIAL state: the literal key name MD5SUM_x86_64.
    # this must precede t_MD5SUM (otherwise the shorter keyword would
    # match first and leave "_x86_64" behind)
    r"MD5SUM_x86_64"
    # the value is a whitespace-separated list, so expect '="' next
    t.lexer.begin('startmulti')
    return t
def t_INITIAL_MD5SUM(t):
    # INITIAL state: the literal key name MD5SUM.
    r"MD5SUM"
    # the value is a whitespace-separated list, so expect '="' next
    t.lexer.begin('startmulti')
    return t
def t_INITIAL_REQUIRES(t):
    # INITIAL state: the literal key name REQUIRES.
    r"REQUIRES"
    # the value is a whitespace-separated list, so expect '="' next
    t.lexer.begin('startmulti')
    return t
def t_INITIAL_MAINTAINER(t):
    # INITIAL state: the literal key name MAINTAINER.
    r"MAINTAINER"
    return t
def t_INITIAL_EMAIL(t):
    # INITIAL state: the literal key name EMAIL.
    r"EMAIL"
    return t
def t_INITIAL_EQUALS(t):
    # INITIAL state: the '=' between a single-valued key and its value.
    r"="
    return t
def t_INITIAL_DQUOTE(t):
    # INITIAL state: a double quote delimiting a single-valued string.
    # don't support single-quoted strings
    r'"'
    return t
def t_INITIAL_NEWLINE(t):
    # INITIAL state: the newline that terminates a key="value" line.
    r"\n"
    # keep the lexer's line counter in step with the input
    t.lexer.lineno += 1
    return t
def t_INITIAL_STRING(t):
    # INITIAL state: any run of characters up to the next double quote.
    # Defined last among the INITIAL rules so that the keyword, '=' and
    # newline rules above get the first chance to match.
    # this can contain whitespace but doesn't support escaped quotes
    r'[^"]+'
    return t
# startmulti state (this is the context for EQUALS DQUOTE preceding
# 'multistring' or 'multimd5value' -- see the parser below)
def t_startmulti_EQUALS(t):
    # startmulti state: the '=' following a list-valued key.
    r"="
    return t
def t_startmulti_DQUOTE(t):
    # startmulti state: the quote that opens a list value.
    # this is always an opening quote and it moves us onto 'multi' state
    r'"'
    t.lexer.begin('multi')
    return t
# multi state (this is the context for whitespace-separated multiple strings
# or md5 values)
def t_multi_BLANKSEQ(t):
    # multi state: a run of blanks separating two list items.
    # spaces and tabs only -- CR FF VT and Unicode whitespace are not supported
    r"[ \t]+"
    return t
def t_multi_BACKSLASH(t):
    # multi state: a single backslash (the grammar uses it, followed by a
    # newline, as a line continuation between list items).
    r"\\"
    return t
def t_multi_NEWLINE(t):
    # multi state: a newline inside a quoted list value.
    r"\n"
    # keep the lexer's line counter in step with the input
    t.lexer.lineno += 1
    return t
def t_multi_DQUOTE(t):
    # multi state: the quote that closes a list value.
    # this is always a closing quote, and it returns us to 'INITIAL' state
    r'"'
    t.lexer.begin('INITIAL')
    return t
def t_multi_MD5VALUE(t):
    # multi state: one MD5 digest inside a quoted list value.
    # An MD5 digest is 32 *hexadecimal* digits, so only 0-9 and a-f (in
    # either case) are accepted; other letters fall through to
    # t_multi_STRING and are then rejected by the md5values grammar rule.
    # case insensitive and exactly 32 chars long
    # this must precede t_multi_STRING
    r"[0-9a-fA-F]{32}"
    # normalise to lower case so callers can compare digests directly
    t.value = t.value.lower()
    return t
def t_multi_STRING(t):
    # multi state: one list item (e.g. a URL or a dependency name).
    # Defined last among the multi rules so the more specific rules above
    # get the first chance to match.
    # this can't contain whitespace and doesn't support escaped quotes
    r'[^" \t]+'
    return t
# ANY state -- error and eof handling
def t_ANY_error(t):
    # Lexer error handler shared by every state.  Reports the first
    # character that no rule could match, steps over it, and hands the
    # token back to the caller.
    # (A logging.error() variant of the report is deliberately disabled.)
    offending = t.value[0]
    print("We are lost at '%s'" % offending)
    t.lexer.skip(1)
    return t
def t_ANY_eof(t):
    # End-of-input handler shared by every lexer state.  Returning None
    # supplies no further token, so lexing simply stops here.
    return None
#-------------------------------------------------------------------------------
# Build the module-level lexer with ply.lex.
lexer = lex.lex()
#-------------------------------------------------------------------------------
def p_infofile(p):
    # Top-level rule: a complete .info file is the ten key lines in this
    # fixed order.  The result is a dict keyed by the .info key names, in
    # the same order as the grammar symbols below.
    """
    infofile : prgnam version homepage download md5sum download_64 md5sum_64 requires maintainer email
    """
    field_names = (
        "PRGNAM",
        "VERSION",
        "HOMEPAGE",
        "DOWNLOAD",
        "MD5SUM",
        "DOWNLOAD_x86_64",
        "MD5SUM_x86_64",
        "REQUIRES",
        "MAINTAINER",
        "EMAIL",
    )
    # p[1]..p[10] hold the values of the ten sub-rules, in grammar order
    p[0] = {name: p[pos] for pos, name in enumerate(field_names, start=1)}
def p_prgnam(p):
    # PRGNAM="..." line; the value is the STRING between the quotes (p[4]).
    """
    prgnam : PRGNAM EQUALS DQUOTE STRING DQUOTE NEWLINE
    """
    p[0] = p[4]
def p_version(p):
    # VERSION="..." line; the value is the STRING between the quotes (p[4]).
    """
    version : VERSION EQUALS DQUOTE STRING DQUOTE NEWLINE
    """
    p[0] = p[4]
def p_homepage(p):
    # HOMEPAGE="..." line; the value is the STRING between the quotes (p[4]).
    """
    homepage : HOMEPAGE EQUALS DQUOTE STRING DQUOTE NEWLINE
    """
    p[0] = p[4]
def p_download(p):
    # DOWNLOAD="..." line; the value is the list built by multistring (p[3]).
    """
    download : DOWNLOAD EQUALS multistring NEWLINE
    """
    p[0] = p[3]
def p_download_64(p):
    # DOWNLOAD_x86_64="..." line; the value is the list built by
    # multistring (p[3]).
    """
    download_64 : DOWNLOAD_x86_64 EQUALS multistring NEWLINE
    """
    p[0] = p[3]
def p_md5sum(p):
    # MD5SUM="..." line; the value is the list built by multimd5value (p[3]).
    """
    md5sum : MD5SUM EQUALS multimd5value NEWLINE
    """
    p[0] = p[3]
def p_md5sum_64(p):
    # MD5SUM_x86_64="..." line; the value is the list built by
    # multimd5value (p[3]).
    """
    md5sum_64 : MD5SUM_x86_64 EQUALS multimd5value NEWLINE
    """
    p[0] = p[3]
def p_requires(p):
    # REQUIRES="..." line; the value is the list built by multistring (p[3]).
    """
    requires : REQUIRES EQUALS multistring NEWLINE
    """
    p[0] = p[3]
def p_maintainer(p):
    # MAINTAINER="..." line; the value is the STRING between the quotes (p[4]).
    """
    maintainer : MAINTAINER EQUALS DQUOTE STRING DQUOTE NEWLINE
    """
    p[0] = p[4]
def p_email(p):
    # EMAIL="..." line; the value is the STRING between the quotes (p[4]).
    """
    email : EMAIL EQUALS DQUOTE STRING DQUOTE NEWLINE
    """
    p[0] = p[4]
def p_multistring(p):
    # A quoted, whitespace-separated list of strings.  Yields the list
    # collected by 'strings', or an empty list for the bare "" form.
    """
    multistring : DQUOTE strings DQUOTE
                | DQUOTE DQUOTE
    """
    has_items = len(p) == 4   # three symbols matched => first alternative
    p[0] = p[2] if has_items else []
def p_strings(p):
    # Left-recursive list of STRING tokens separated by 'whitespace'.
    # Yields a Python list of the strings, in input order.
    """
    strings : strings whitespace STRING
            | STRING
    """
    if len(p) != 4:
        # single STRING: start a new list
        p[0] = [p[1]]
        return
    # extend the list built so far (in place) with the newest STRING
    collected = p[1]
    collected.append(p[3])
    p[0] = collected
def p_multimd5value(p):
    # A quoted, whitespace-separated list of MD5 values.  Yields the list
    # collected by 'md5values', or an empty list for the bare "" form.
    """
    multimd5value : DQUOTE md5values DQUOTE
                  | DQUOTE DQUOTE
    """
    has_items = len(p) == 4   # three symbols matched => first alternative
    p[0] = p[2] if has_items else []
def p_md5values(p):
    # Left-recursive list of MD5VALUE tokens separated by 'whitespace'.
    # Yields a Python list of the values, in input order.
    """
    md5values : md5values whitespace MD5VALUE
              | MD5VALUE
    """
    if len(p) != 4:
        # single MD5VALUE: start a new list
        p[0] = [p[1]]
        return
    # extend the list built so far (in place) with the newest value
    collected = p[1]
    collected.append(p[3])
    p[0] = collected
def p_whitespace(p):
    # Separator between two list items: a run of blanks, optionally
    # followed by a backslash-newline continuation and further blanks.
    # It carries no value of its own.
    """
    whitespace : BLANKSEQ BACKSLASH NEWLINE BLANKSEQ
               | BLANKSEQ BACKSLASH NEWLINE
               | BLANKSEQ
    """
    p[0] = None
def p_error(p):
    # Parser error handler.  'p' is the token at which parsing failed, or
    # None when the input ended before the grammar was satisfied.
    # The report names the offending token and its line so the problem can
    # be located in the .info file.
    # (A logging.error() variant of the report is deliberately disabled.)
    if p is None:
        print("Syntax error in input! Unexpected end of input")
    else:
        print("Syntax error in input! Unexpected %s %r at line %s"
              % (p.type, p.value, p.lineno))
#-------------------------------------------------------------------------------
# Build the module-level parser with ply.yacc.
parser = yacc.yacc()
#-------------------------------------------------------------------------------
def parse_info(infopath):
    """
    Parse the contents of a SlackBuilds.org info file.

    Argument:
      infopath -- the path to the file
    Returns: a dict, as described in the "Implementation Note"
      (or whatever parser.parse() yields if parsing is abandoned after a
      syntax error -- with ply that is normally None).
    Raises: OSError if the file cannot be opened or read.
    """
    with open(infopath) as infofile:
        infotext = infofile.read()
    # The lexer is a module-level object shared by every call.  Feeding it
    # new input does not reset its state or line counter, so a previous
    # parse that stopped inside a quoted list (state 'multi' or
    # 'startmulti') would make this file be tokenised wrongly.  Start from
    # a clean slate each time.
    lexer.begin('INITIAL')
    lexer.lineno = 1
    # Pass the lexer explicitly rather than relying on ply's "most recently
    # built lexer" default.
    return parser.parse(infotext, lexer=lexer)
#-------------------------------------------------------------------------------
"""
parse_pkgpath.py
Parse a Slackware package name.
Requires python3.
dbs 2018-09-08 Unlicense http://unlicense.org/
Provides:
parse_pkgpath(pkgpath)
(returns parsed pathname 'pkgpath' as a tuple of 7 strings, or None if the
filename does not have the required fields)
"""
import re
import sys
#-------------------------------------------------------------------------------
# Leading digits of the last '-' separated field: the build number.
re_build = re.compile(r"[0-9]+")
# Package extension such as .tgz/.txz/.tbz/.tlz, plus anything after it.
re_pkgext = re.compile(r"\.t.z.*")


def parse_pkgpath(pkgpath):
    """
    Parse a Slackware package name.

    Argument: a pathname, or just a filename
      * the filename must have at least 4 '-' separated fields
    Returns: a tuple of (catnam, prgnam, version, arch, build, tag, pkgtype)
      * the prgnam, version, arch and build fields are all required
      * None is returned if the filename doesn't have the required fields
      * if any other field is omitted, its tuple element will be a null string
    """
    pathchunks = pkgpath.split("/")
    pkgchunks = pathchunks[-1].split("-")
    if len(pkgchunks) < 4:
        return None

    # prgnam may itself contain '-', so the fixed fields are taken from
    # the right-hand end of the filename
    prgnam = "-".join(pkgchunks[:-3])
    version, arch, buildtagext = pkgchunks[-3:]
    if not (prgnam and version and arch):
        # a required field is present but empty, e.g. "foo--x86_64-1.txz"
        return None

    buildmatch = re_build.match(buildtagext)
    if not buildmatch:
        # if there's no matched build, everything else is probably wrong too
        return None
    build = buildmatch.group(0)
    buildtagext = buildtagext[buildmatch.end():]

    catnam = ""
    if len(pathchunks) >= 2:
        catnam = pathchunks[-2]
        # fixup for extra/prgnam/*.txz, {patches,testing}/packages/*.txz:
        # the real category is one directory further up
        if catnam in (prgnam, "packages") and len(pathchunks) >= 3:
            catnam = pathchunks[-3]

    # whatever follows the build number is "<tag><.ext>", both optional
    pkgtype = ""
    extmatch = re_pkgext.search(buildtagext)
    if extmatch:
        # three characters after the dot, e.g. "txz"
        pkgtype = extmatch.group(0)[1:4]
        buildtagext = buildtagext[:extmatch.start()]
    tag = buildtagext

    return (catnam, prgnam, version, arch, build, tag, pkgtype)
#-------------------------------------------------------------------------------
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment