
Commit 91bd9dd

Code Express committed
Switched the XML parser package and enabled modules on the project
1 parent e627cd8 commit 91bd9dd

7 files changed, +206 -62 lines

README.md

Lines changed: 102 additions & 3 deletions
```diff
@@ -1,11 +1,110 @@
 # Web Plucker
 
-`webpluck` scrapes a specific values from a web page. It works as a standalone
-binary as well as in a API mode.
+`webpluck` scrapes specific values from a web page. It works as a
+standalone binary as well as in API mode.
 
-`webpluck` takes the following as input:
+## Download
+
+### Latest Releases
+`webpluck` is available for 64-bit Linux, OS X and Windows systems.
+Latest versions can be downloaded from the
+[Release](https://github.com/codeexpress/webpluck/releases) tab above. This is the preferred way.
+
+### Build from source
+This is a Go project. Assuming you have the Go compiler installed,
+the following will build the binary from scratch:
+```
+$ git clone https://github.com/codeexpress/webpluck
+$ cd webpluck
+$ go get
+$ go build -o webpluck ./cmd
+```
+
+## Usage
+`webpluck` takes the following input:
 - URL of the webpage
 - XPATH of the element
 - optional regex to further narrow the selection
 
 and outputs the selected value.
+
+### 1. URL of the webpage
+This is the link to the webpage that has the desired information we want to extract.
+
+For example, if we want to scrape the founders of the StackOverflow website from its company page, the URL is:
+https://stackoverflow.com/company. The desired value we want to extract is: **Joel Spolsky and Jeff Atwood**
+
+<img width="507" alt="baseUrl" src="https://user-images.githubusercontent.com/14211134/81618604-5335bf00-9405-11ea-8b8c-ddb75e194983.png">
+
+### 2. XPATH of the element
+This is the **xpath** of the element on the page that contains the required information. A good way to get it (in the Chrome browser) is to:
+- Right click on the place where the information is present
+- Click "Inspect" to open the Chrome developer tools window with the element highlighted
+- On the highlighted value in the HTML source code, `Right click -> Copy -> Copy xpath`
+- The copied value is the xpath we need
+
+
+Get xpath Step 1 | Get xpath Step 2
+:-------------------------:|:-------------------------:
+<img width="352" alt="Screen Shot 2020-05-12 at 4 06 13 AM" src="https://user-images.githubusercontent.com/14211134/81619156-8d539080-9406-11ea-99bf-17e9e4da7e87.png" > | <img width="355" alt="Screen Shot 2020-05-12 at 4 08 02 AM" src="https://user-images.githubusercontent.com/14211134/81619157-8e84bd80-9406-11ea-8941-b6c6e0dfab46.png">
+
+The xpath in the example above comes out to be: ```//*[@id="content"]/section[3]/ol/li[1]/ol/li[2]/text()```
+
+### 3. `regex` to pluck the right value
+
+Note that the xpath above leads us to the value: *Joel Spolsky and Jeff Atwood launch Stack Overflow*
+
+Since we want to trim that down further, we'll provide a regex value to extract just the names.
+
+This regex will fetch just the names (the value captured in parentheses):
+``` ^(.*) launch .* ```
+
+## Sample standalone invocation
+
+`webpluck` can be run as a standalone binary. To extract the names using the three params we just obtained, copy the `targets.yml` file and populate it with the parameters. The resulting `targets.yml` should look like this:
+
+```yaml
+targetList:
+- name: stackoverflow_founders
+  baseUrl: https://stackoverflow.com/company
+  xpath: //*[@id="content"]/section[3]/ol/li[1]/ol/li[2]/text()
+  regex: ^(.*) launch .*
+```
+
+Now invoke webpluck as follows and obtain the answer:
+```bash
+$ ./webpluck_osx -f /path/to/targets.yml
+{
+  "stackoverflow_founders": "Joel Spolsky and Jeff Atwood"
+}
+```
+
+## Sample API invocation
+
+`webpluck` can be run in server mode as well. Thereafter, clients written in other programming languages can scrape web pages using the `webpluck` API over the network.
+
+To run `webpluck` in server mode listening on localhost port 8080:
+```bash
+$ ./webpluck -p 8080
+```
+
+An instance of the `webpluck` API is running at `https://api.code.express/webpluck/`. You can use that for your light extraction needs. If your load is heavy, consider spinning up your own server running `webpluck`.
+
+Armed with the knowledge of `baseUrl`, `xpath` and `regex`, we can now call the API endpoint by POSTing these three params.
+Example `curl` invocation for the server mode:
+```bash
+curl 'https://api.code.express/webpluck/' \
+  --data-urlencode 'baseUrl=https://stackoverflow.com/company' \
+  --data-urlencode 'xpath=//*[@id="content"]/section[3]/ol/li[1]/ol/li[2]/text()' \
+  --data-urlencode 'regex=^(.*) launch .*' -g
+```
+
+The result from the API is as follows. The `pluckedData` field returns the value extracted:
+```json
+{
+  "baseUrl": "https://stackoverflow.com/company",
+  "pluckedData": "Joel Spolsky and Jeff Atwood",
+  "regex": "^(.*) launch .*",
+  "xpath": "//*[@id=\"content\"]/section[3]/ol/li[1]/ol/li[2]/text()"
+}
+```
```

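The three inputs the README above walks through (`baseUrl`, `xpath`, `regex`) map onto a handful of library calls. Below is a minimal sketch of that flow using the `antchfx/htmlquery` package this commit switches to, fed with the README's StackOverflow example; it is illustrative only, does not reuse the project's own `ExtractTextFromUrl` (shown further down in this diff), and its error handling is an assumption rather than how `webpluck` reports failures.

```go
package main

import (
    "fmt"
    "log"
    "regexp"

    "github.com/antchfx/htmlquery"
)

func main() {
    baseUrl := "https://stackoverflow.com/company"
    xpath := `//*[@id="content"]/section[3]/ol/li[1]/ol/li[2]/text()`
    regex := `^(.*) launch .*`

    // Fetch the page and parse it into an HTML node tree.
    doc, err := htmlquery.LoadURL(baseUrl)
    if err != nil {
        log.Fatal(err)
    }

    // Select the node the XPath points at and take its text content.
    node := htmlquery.FindOne(doc, xpath)
    if node == nil {
        log.Fatal("xpath matched nothing")
    }
    value := htmlquery.InnerText(node)

    // The first capture group of the regex holds the names we want.
    m := regexp.MustCompile(regex).FindStringSubmatch(value)
    if len(m) < 2 {
        log.Fatal("regex did not match")
    }
    fmt.Println(m[1]) // e.g. Joel Spolsky and Jeff Atwood
}
```
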
main.go renamed to cmd/main.go

Lines changed: 30 additions & 21 deletions
```diff
@@ -10,23 +10,32 @@ import (
     "os"
     "strconv"
 
+    "github.com/codeexpress/webpluck/logger"
+    "github.com/codeexpress/webpluck/webpluck"
     "gopkg.in/yaml.v2"
 )
 
-const (
-    UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36"
-)
-
 var (
     //argument flags
     filePtr       *string
     outputTextPtr *bool
     serverModePtr *int
 )
 
+type targetList struct {
+    TargetList []dataLocation `yaml:"targetList"`
+}
+
+type dataLocation struct {
+    Name    string `yaml:"name"`
+    BaseUrl string `yaml:"baseUrl"`
+    Xpath   string `yaml:"xpath"`
+    Regex   string `yaml:"regex"`
+}
+
 func main() {
     initFlags()
-    initLogger()
+    logger.InitLogger()
 
     serverMode := isFlagPassed("p")
 
@@ -42,7 +51,7 @@ Listens on a port and answers online queries of type:
 http://localhost:8080?baseUrl="example.com"&xpath="/html/body"&regex=""
 */
 func serveApi() {
-    logIt("Started HTTP server on localhost: "+strconv.Itoa(*serverModePtr), true)
+    logger.LogIt("Started HTTP server on localhost: "+strconv.Itoa(*serverModePtr), true)
 
     http.HandleFunc("/", handleHttp)
     fmt.Println(http.ListenAndServe(":"+strconv.Itoa(*serverModePtr), nil))
@@ -59,22 +68,22 @@ func handleHttp(w http.ResponseWriter, req *http.Request) {
     results["xpath"] = xpath
     results["regex"] = regex
 
-    logIt(getIp(req) + " " + req.Header.Get("User-Agent") + " Request: ")
-    logIt(results)
+    logger.LogIt(getIp(req) + " " + req.Header.Get("User-Agent") + " Request: ")
+    logger.LogIt(results)
     defer func() { // in case of panic
         if err := recover(); err != nil {
-            http.Error(w, "my own error message", http.StatusInternalServerError)
+            http.Error(w, "Internal Server Error", http.StatusInternalServerError)
             fmt.Fprintf(w, "Webpluck encountered an error. Make sure that the baseUrl is a valid URL and xpath and regex are valid\n")
-            fmt.Fprintf(w, "Error encountered is:\n%s\n", err)
-            logIt(err)
+            fmt.Fprintf(w, "Error description:\n%s\n", err)
+            logger.LogIt(err)
         }
     }()
-    text := ExtractTextFromUrl(baseUrl, xpath, regex)
+    text := webpluck.ExtractTextFromUrl(baseUrl, xpath, regex)
     results["pluckedData"] = text
     jsonString, err := json.MarshalIndent(results, "", " ")
     check(err)
     fmt.Fprintf(w, string(jsonString))
-    logIt("Answer: " + text)
+    logger.LogIt("Answer: " + text)
 }
 
 func pluckFromFile() {
@@ -88,15 +97,15 @@ func pluckFromFile() {
     results := make(map[string]string)
 
     for _, t := range list.TargetList {
-        text := ExtractTextFromUrl(t.BaseUrl, t.Xpath, t.Regex)
+        text := webpluck.ExtractTextFromUrl(t.BaseUrl, t.Xpath, t.Regex)
         results[t.Name] = text
         if *outputTextPtr { // if output to text (t) flag is set
             fmt.Println(t.Name + ": " + text)
         }
     }
 
-    logIt("Webpluck invoked. Reading from file: " + *filePtr)
-    logIt(results)
+    logger.LogIt("Webpluck invoked. Reading from file: " + *filePtr)
+    logger.LogIt(results)
 
     if !*outputTextPtr { // default case is to print in JSON
         jsonString, err := json.MarshalIndent(results, "", " ")
@@ -143,12 +152,12 @@ func check(e error) {
 // Get IP address of the incoming HTTP request based on forwarded-for
 // header (present in case of proxy). If not, use the remote address
 func getIp(req *http.Request) string {
-    forwarded := req.Header.Get("X-FORWARDED-FOR")
-    var addr string
-    if forwarded != "" {
-        addr = forwarded
+    forwardedIp := req.Header.Get("X-Forwarded-For")
+    if forwardedIp != "" {
+        return forwardedIp
     }
-    addr = req.RemoteAddr
+
+    addr := req.RemoteAddr
     ip, _, _ := net.SplitHostPort(addr)
     return ip
 }
```

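The `handleHttp` handler above echoes the three request parameters back together with a `pluckedData` field, which is exactly what the README's `curl` example exercises. Below is a hedged client sketch in Go that POSTs the same URL-encoded form values to a locally running server (`./webpluck -p 8080`); it assumes the handler reads `baseUrl`, `xpath` and `regex` as form values, as the `--data-urlencode` flags in the `curl` invocation imply.

```go
package main

import (
    "encoding/json"
    "fmt"
    "log"
    "net/http"
    "net/url"
    "strings"
)

func main() {
    // The same three parameters the README's curl example sends.
    form := url.Values{}
    form.Set("baseUrl", "https://stackoverflow.com/company")
    form.Set("xpath", `//*[@id="content"]/section[3]/ol/li[1]/ol/li[2]/text()`)
    form.Set("regex", `^(.*) launch .*`)

    resp, err := http.Post("http://localhost:8080/",
        "application/x-www-form-urlencoded", strings.NewReader(form.Encode()))
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()

    // The server replies with a flat JSON object; pluckedData holds the result.
    var result map[string]string
    if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
        log.Fatal(err)
    }
    fmt.Println(result["pluckedData"])
}
```
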
go.mod

Lines changed: 15 additions & 0 deletions
```diff
@@ -0,0 +1,15 @@
+module github.com/codeexpress/webpluck
+
+go 1.17
+
+require (
+    github.com/antchfx/htmlquery v1.2.4
+    golang.org/x/net v0.0.0-20211011170408-caeb26a5c8c0
+    gopkg.in/yaml.v2 v2.4.0
+)
+
+require (
+    github.com/antchfx/xpath v1.2.0 // indirect
+    github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect
+    golang.org/x/text v0.3.6 // indirect
+)
```

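With the module path declared above, the scraper and logger become importable subpackages instead of files in `package main`. A minimal consumer sketch, assuming `ExtractTextFromUrl` takes three string arguments and returns a string, as its call sites in `cmd/main.go` suggest:

```go
package main

import (
    "fmt"

    // Import path follows the module declared in go.mod.
    "github.com/codeexpress/webpluck/webpluck"
)

func main() {
    // Same baseUrl, xpath and regex as the README example.
    text := webpluck.ExtractTextFromUrl(
        "https://stackoverflow.com/company",
        `//*[@id="content"]/section[3]/ol/li[1]/ol/li[2]/text()`,
        `^(.*) launch .*`,
    )
    fmt.Println(text) // expected: Joel Spolsky and Jeff Atwood
}
```
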
go.sum

Lines changed: 23 additions & 0 deletions
```diff
@@ -0,0 +1,23 @@
+github.com/antchfx/htmlquery v1.2.4 h1:qLteofCMe/KGovBI6SQgmou2QNyedFUW+pE+BpeZ494=
+github.com/antchfx/htmlquery v1.2.4/go.mod h1:2xO6iu3EVWs7R2JYqBbp8YzG50gj/ofqs5/0VZoDZLc=
+github.com/antchfx/xpath v1.2.0 h1:mbwv7co+x0RwgeGAOHdrKy89GvHaGvxxBtPK0uF9Zr8=
+github.com/antchfx/xpath v1.2.0/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
+github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY=
+github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
+golang.org/x/net v0.0.0-20211011170408-caeb26a5c8c0 h1:qOfNqBm5gk93LjGZo1MJaKY6Bph39zOKz1Hz2ogHj1w=
+golang.org/x/net v0.0.0-20211011170408-caeb26a5c8c0/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
+gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
```

logger.go renamed to logger/logger.go

Lines changed: 3 additions & 3 deletions
```diff
@@ -1,4 +1,4 @@
-package main
+package logger
 
 import (
     "encoding/json"
@@ -14,7 +14,7 @@ var (
 )
 
 // Initializing the logger and customizing prefix
-func initLogger() {
+func InitLogger() {
     f, err := os.OpenFile("run.log", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
     if err != nil {
         panic(err)
@@ -26,7 +26,7 @@ func initLogger() {
 // Logs to log file
 // takes generic object and then based on the type of object,
 // logs is in appropriate style.
-func logIt(val interface{}, console ...bool) {
+func LogIt(val interface{}, console ...bool) {
     // if console is passed, print to stdout as well
     if len(console) != 0 {
         fmt.Println(val)
```

targets.yml

Lines changed: 13 additions & 9 deletions
```diff
@@ -1,13 +1,17 @@
 targetList:
-- name: example.com
-  baseUrl: http://example.com/
-  xpath: /html/body/div/p[2]/a/@href
-  regex: ^(?:https?://)?(?:[^@\n]+@)?([^:/\n]+)
-- name: stackoverflow_founders
-  baseUrl: https://stackoverflow.com/company
-  xpath: //*[@id="content"]/section[3]/ol/li[1]/ol/li[2]
-  regex: ^(.*) launch .*
-- name: stackoverflow_example_without_regex
+- name: ufl_ms_cs_fall
+  baseUrl: https://www.cise.ufl.edu/admissions/graduate/
+  xpath: //*[@id="tablepress-14"]/tbody/tr[2]/td[2]
+  regex:
+- name: ufl_ms_cs_spring
+  baseUrl: https://www.cise.ufl.edu/admissions/graduate/
+  xpath: //*[@id="tablepress-15"]/tbody/tr[1]/td[2]
+  regex:
+- name: ncsu_cs_ms
+  baseUrl: https://www.csc.ncsu.edu/academics/graduate/admdeadlines.php
+  xpath: //*[@id="main"]/ol/li[1]/ul/li[1]/strong
+  regex:
+- name: stackoverflow_extract_asked_date_of_a_question
   baseUrl: https://stackoverflow.com/questions/18361750
   xpath: //*[@id="question"]/div[2]/div[2]/div[3]/div/div[3]/div/div[1]
   regex:
```

webpluck.go renamed to webpluck/webpluck.go

Lines changed: 20 additions & 26 deletions
```diff
@@ -1,24 +1,18 @@
-package main
+package webpluck
 
 import (
     "io/ioutil"
     "net/http"
     "regexp"
     "strings"
 
-    "gopkg.in/xmlpath.v2"
+    "github.com/antchfx/htmlquery"
+    "golang.org/x/net/html"
 )
 
-type targetList struct {
-    TargetList []dataLocation `yaml:"targetList"`
-}
-
-type dataLocation struct {
-    Name    string `yaml:"name"`
-    BaseUrl string `yaml:"baseUrl"`
-    Xpath   string `yaml:"xpath"`
-    Regex   string `yaml:"regex"`
-}
+const (
+    UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36"
+)
 
 /* Params:
    url - URL of the page to be scraped
@@ -35,23 +29,24 @@ func ExtractTextFromUrl(
     text := ""
 
     //logIt("Fetch from URL: "+url, 1)
-    parsedHtml := fetchUrl(url) // returns a xmlpath.Node object
-    path := xmlpath.MustCompile(xpath)
-    value, ok := path.String(parsedHtml)
-    if ok {
-        if regex != "" {
-            // try applying regex
-            regexMatch := regexp.MustCompile(regex)
-            text = regexMatch.FindStringSubmatch(string(value))[1]
-        } else {
-            text = value // no regex, the xpath element is the value
-        }
+    parsedHtml := fetchUrl(url) // returns a xmlquery.Node object
+
+    node := htmlquery.FindOne(parsedHtml, xpath)
+    value := htmlquery.InnerText(node)
+
+    if regex != "" {
+        // try applying regex
+        regexMatch := regexp.MustCompile(regex)
+        text = regexMatch.FindStringSubmatch(string(value))[1]
+    } else {
+        text = value // no regex, the xpath element is the value
     }
+
     return strings.TrimSpace(text)
 }
 
 // does a HTTP GET and returns the HTML body for that URL
-func fetchUrl(url string) *xmlpath.Node {
+func fetchUrl(url string) *html.Node {
     client := &http.Client{}
     req, err := http.NewRequest("GET", url, nil)
     if err != nil {
@@ -66,8 +61,7 @@ func fetchUrl(url string) *xmlpath.Node {
 
     html, _ := ioutil.ReadAll(resp.Body)
     htmlStr := string(html)
-
-    parsedHtml, err := xmlpath.ParseHTML(strings.NewReader(htmlStr))
+    parsedHtml, err := htmlquery.Parse(strings.NewReader(htmlStr))
     if err != nil {
         panic(err)
     }
```

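One behavioural note on the parser switch above: the old `xmlpath` code reported a missing match through the `ok` flag, while `htmlquery.FindOne` returns `nil` when the XPath matches nothing, so the following `InnerText` call panics and, in server mode, it is the `recover()` in `handleHttp` that turns that panic into an error response. A defensive variant, purely as a sketch built around a hypothetical `findText` helper:

```go
package main

import (
    "fmt"
    "log"
    "strings"

    "github.com/antchfx/htmlquery"
    "golang.org/x/net/html"
)

// findText is a hypothetical checked helper; webpluck itself lets the nil
// dereference panic and relies on recover() in the HTTP handler.
func findText(doc *html.Node, xpath string) (string, error) {
    node := htmlquery.FindOne(doc, xpath)
    if node == nil {
        return "", fmt.Errorf("xpath %q matched no node", xpath)
    }
    return htmlquery.InnerText(node), nil
}

func main() {
    doc, err := htmlquery.Parse(strings.NewReader(`<html><body><p id="a">hello</p></body></html>`))
    if err != nil {
        log.Fatal(err)
    }

    if text, err := findText(doc, `//p[@id="a"]/text()`); err == nil {
        fmt.Println(text) // hello
    }

    // A non-matching XPath surfaces as an error instead of a nil-pointer panic.
    if _, err := findText(doc, `//p[@id="missing"]`); err != nil {
        fmt.Println(err)
    }
}
```
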
0 commit comments