Links

Parsing Templates

Nimble Labs Beta Feature
Parsing templates allow users to accurately extract specific snippets or key data points from a webpage. By using industry-standard CSS selectors, parsing templates can extract data precisely from almost any webpage.
Parsing templates extract key data points from a webpage
Parsing templates also come with a variety of options designed to make data extraction a breeze, such as built-in support for tables, JSON output, and custom objects. They make use of the same framework used by other popular parsing libraries such as Beautiful Soup, making for an easy and familiar experience.
When using parsing templates, it's important to monitor changes in the source webpage's structure and their effect on your parsing templates. Nimble does not maintain or update custom parsing templates.

Getting started

To use parsing templates, first set parse to true in order to enable page parsing, set format to json, and then define your parsing template by adding a parser parameter:
cURL
Python
Node.js
Go
# POST a realtime request that carries one inline parsing template ("template_name").
# <credential string> is a placeholder; substitute your own Basic-auth credential.
# "parse": true and "format": "json" are both set so the "parser" templates are applied.
curl -X POST 'https://api.webit.live/api/v1/realtime/web' \
--header 'Authorization: Basic <credential string>' \
--header 'Content-Type: application/json' \
--data-raw '{
"url": "https://www.example.com",
"render": true,
"format": "json",
"parse": true,
"parser": {
"template_name": {
"type": "item",
"selectors": [".css-selector"],
"extractor": "text"
}
}
}'
import requests

# Realtime Web API endpoint used for scrape-and-parse requests.
endpoint = 'https://api.webit.live/api/v1/realtime/web'

# Inline parsing templates, keyed by template name. Each template's name is
# used as the property name under which its extracted value is returned.
parsing_templates = {
    "template_name": {
        "type": "item",
        "selectors": [".css-selector"],
        "extractor": "text",
    },
}

# "parse" and "format" are set alongside "parser" so the templates are applied
# and the result comes back as JSON.
request_body = {
    "url": "https://www.example.com",
    "render": True,
    "format": "json",
    "parse": True,
    "parser": parsing_templates,
}

# <credential string> is a placeholder for your own Basic-auth credential.
response = requests.post(
    endpoint,
    headers={
        'Authorization': 'Basic <credential string>',
        'Content-Type': 'application/json',
    },
    json=request_body,
)

print(response.status_code)
print(response.json())
const axios = require('axios');

/**
 * Sends one realtime request carrying a single inline parsing template and
 * logs the HTTP status followed by the response payload. Any failure is
 * written to stderr instead of being rethrown.
 */
async function main() {
  const endpoint = 'https://api.webit.live/api/v1/realtime/web';

  // <credential string> is a placeholder for your own Basic-auth credential.
  const requestHeaders = {
    'Authorization': 'Basic <credential string>',
    'Content-Type': 'application/json',
  };

  // "parse" and "format" are set alongside "parser" so the template is applied.
  const requestBody = {
    "url": "https://www.example.com",
    "render": true,
    "format": "json",
    "parse": true,
    "parser": {
      "template_name": {
        "type": "item",
        "selectors": [".css-selector"],
        "extractor": "text",
      },
    },
  };

  try {
    const response = await axios.post(endpoint, requestBody, { headers: requestHeaders });
    console.log(response.status);
    console.log(response.data);
  } catch (error) {
    console.error(error);
  }
}

main();
// Example program: sends one realtime request carrying a single inline
// parsing template and prints the HTTP status code followed by the raw
// response body.
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	url := "https://api.webit.live/api/v1/realtime/web"

	// "parse" and "format" are set alongside "parser" so the template is applied.
	payload := []byte(`{
"url": "https://www.example.com",
"render": true,
"format": "json",
"parse": true,
"parser": {
"template_name": {
"type": "item",
"selectors": [".css-selector"],
"extractor": "text"
}
}
}`)

	// <credential string> is a placeholder for your own Basic-auth credential.
	headers := map[string]string{
		"Authorization": "Basic <credential string>",
		"Content-Type":  "application/json",
	}

	req, err := http.NewRequest("POST", url, bytes.NewBuffer(payload))
	if err != nil {
		fmt.Println(err)
		return
	}
	for key, value := range headers {
		req.Header.Set(key, value)
	}

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()

	fmt.Println(resp.StatusCode)

	// Print the body as well, so the parsed result is visible just as in the
	// Python and Node.js versions of this example.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(string(body))
}
Be sure to set the parse parameter to true and the format parameter to json when sending a request with parsing templates!
The name of the template becomes the title of the property returned in the response. For example, the response to the above parser would look like:
{
...
"template_name": "extracted data here"
}
Multiple, independent parsing templates can also be sent in a single request, as in the example below.
cURL
Python
Node.js
Go
# POST a realtime request that carries two independent inline parsing templates
# ("template_one" and "template_two") in a single "parser" object.
# <credential string> is a placeholder; substitute your own Basic-auth credential.
curl -X POST 'https://api.webit.live/api/v1/realtime/web' \
--header 'Authorization: Basic <credential string>' \
--header 'Content-Type: application/json' \
--data-raw '{
"url": "https://www.example.com",
"render": true,
"format": "json",
"parse": true,
"parser": {
"template_one": {
"type": "item",
"selectors": [".css-selector"],
"extractor": "text"
},
"template_two": {
"type": "list",
"selectors": ["h1,h2,h3"]
}
}
}'
import requests

# Realtime Web API endpoint used for scrape-and-parse requests.
endpoint = 'https://api.webit.live/api/v1/realtime/web'

# Two independent inline parsing templates sent in the same request,
# keyed by template name.
parsing_templates = {
    "template_one": {
        "type": "item",
        "selectors": [".css-selector"],
        "extractor": "text",
    },
    "template_two": {
        "type": "list",
        "selectors": ["h1,h2,h3"],
    },
}

# "parse" and "format" are set alongside "parser" so the templates are applied
# and the result comes back as JSON.
request_body = {
    "url": "https://www.example.com",
    "render": True,
    "format": "json",
    "parse": True,
    "parser": parsing_templates,
}

# <credential string> is a placeholder for your own Basic-auth credential.
response = requests.post(
    endpoint,
    headers={
        'Authorization': 'Basic <credential string>',
        'Content-Type': 'application/json',
    },
    json=request_body,
)

print(response.status_code)
print(response.json())
const axios = require('axios');

/**
 * Sends one realtime request carrying two independent inline parsing
 * templates and logs the HTTP status followed by the response payload.
 * Any failure is written to stderr instead of being rethrown.
 */
async function main() {
  const endpoint = 'https://api.webit.live/api/v1/realtime/web';

  // <credential string> is a placeholder for your own Basic-auth credential.
  const requestHeaders = {
    'Authorization': 'Basic <credential string>',
    'Content-Type': 'application/json',
  };

  // Both templates live side by side inside the same "parser" object.
  const requestBody = {
    "url": "https://www.example.com",
    "render": true,
    "format": "json",
    "parse": true,
    "parser": {
      "template_one": {
        "type": "item",
        "selectors": [".css-selector"],
        "extractor": "text",
      },
      "template_two": {
        "type": "list",
        "selectors": ["h1,h2,h3"],
      },
    },
  };

  try {
    const response = await axios.post(endpoint, requestBody, { headers: requestHeaders });
    console.log(response.status);
    console.log(response.data);
  } catch (error) {
    console.error(error);
  }
}

main();
// Example program: sends one realtime request carrying two independent inline
// parsing templates and prints the HTTP status code followed by the raw
// response body.
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	url := "https://api.webit.live/api/v1/realtime/web"

	// Both templates live side by side inside the same "parser" object.
	payload := []byte(`{
"url": "https://www.example.com",
"render": true,
"format": "json",
"parse": true,
"parser": {
"template_one": {
"type": "item",
"selectors": [".css-selector"],
"extractor": "text"
},
"template_two": {
"type": "list",
"selectors": ["h1,h2,h3"]
}
}
}`)

	// <credential string> is a placeholder for your own Basic-auth credential.
	headers := map[string]string{
		"Authorization": "Basic <credential string>",
		"Content-Type":  "application/json",
	}

	req, err := http.NewRequest("POST", url, bytes.NewBuffer(payload))
	if err != nil {
		fmt.Println(err)
		return
	}
	for key, value := range headers {
		req.Header.Set(key, value)
	}

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()

	fmt.Println(resp.StatusCode)

	// Print the body as well, so the parsed result is visible just as in the
	// Python and Node.js versions of this example.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(string(body))
}

Parsing template syntax

At its core, a parsing template is built of three properties:
  • type (required) - defines the format of the returned data. For example, setting type to json will instruct the parser to structure the extracted data into JSON, and then return the JSON object.
  • selectors (required) - The CSS selector or selectors of the elements that should be extracted by the parser. Listing more than one selector creates fallback selectors, meaning that if the first selector isn’t found, the parser will look for the second, then the third, etc.
  • extractor - Once an element has been identified by its selector, the extractor defines what part of the element should be returned.

Types

Types define the return format of extracted data. Types are a required field, and can have the following values:
Value
Description
item (Default)
Returns the contents of the first element matched by the defined CSS selector.
list
Returns a list of data points extracted from all elements that match the CSS selector.
json
Returns the contents of the first element (like item), but formatted into JSON.
table
Converts an HTML table into JSON, using the headers of the table as keys. Use this type when targeting <table></table> elements.
object
Define a custom object that is populated and returned. The structure/properties of the object are defined using fields (see below). The object will be populated using the first element that matches the defined CSS selector.
object-list
Returns a list of objects, populated by all the elements that match the defined CSS selector. The structure/properties of objects are defined using fields (see below).
Example - list
...
"parser": {
"template_name": {
"type": "list",
"selectors": ["h1,h2,h3"],
"extractor": "text"
}
}
Example - json
...
"parser": {
"template_name": {
"type": "json",
"selectors": ["script[type='application/json']"]
}
}
Example - table
...
"parser": {
"template_name": {
"type": "table",
"selectors": [".someTable"]
}
}

Extractors

When an element matches the defined CSS selectors, the extractor defines which part of the matched element is extracted and returned. Extractors can have three possible values:
Name
Description
text (default)
Extracts the text of the selected element.
html
Extracts the full inner HTML of the selected element.
[attribute-name]
Extracts the value of an HTML attribute of the selected element (eg: id, href, etc.)
Extractors can only be used when type is set to one of:
  • item
  • list
  • json
Tables do not use extractors because the structure of the table defines the way the table is parsed. Objects, and by extension object-lists, use “fields” to define the data to be extracted.
Examples
Let's assume the page being parsed is made up of the following HTML:
<html>
<head>
<title>parsing demo</title>
</head>
<body>
<div class="article">
<p>
Lorem ipsum dolor sit amet, <span>consectetur adipiscing elit.</span> Duis sapien eros, euismod vel magna sodales,
porttitor tristique mi. Phasellus vel lobortis mi,
<a href="https://www.somedomain.com">nec pharetra risus.</a>
Sed quis augue in ligula blandit ullamcorper non et elit.
</p>
</div>
</body>
</html>
In the below parsing template example, the first template “link” searches for the first link in the page (an element matching the “a” selector), and then uses the [attribute-name] extractor to extract the URL to which the link is pointing.
The second template looks for a div with the class “article” and extracts the full html contents.
...
"parser": {
"link": {
"selectors": ["a"],
"extractor": "[href]"
},
"article": {
"selectors": ["div.article"],
"extractor": "html"
}
}
The response for this parsing template processing our example HTML would produce the following output:
{
...
"link": "https://www.somedomain.com",
"article": "<p>Lorem ipsum dolor sit amet, <span>consectetur adipiscing elit.</span> Duis sapien eros, euismod vel magna sodales, porttitor tristique mi. Phasellus vel lobortis mi, <a href=\"https://www.somedomain.com\">nec pharetra risus.</a> Sed quis augue in ligula blandit ullamcorper non et elit. </p>"
}

Objects

Objects allow users to define a customized return structure that can capture data in a way that is more accessible and better represents the data they are trying to collect. For example, when collecting product data, a “product” object can be created, with fields for price, inventory, color, shipping method, and other contextually relevant factors.
Because objects are fully user-defined, different objects can be created for different sources, purposes, or any other use case!
An object has a type, which is always set to object, and selectors, which define the scope or parent element from which fields (which each have their own selectors) select from.
Fields make up the contents of the object, and each one has a title and a selector. The title defines the name of the field, and the selector defines the CSS selector that should be used to populate its value, where that element is a child of the object selector. For example:
...
"product": {
"type": "object",
"selectors": [ ".product-card" ],
"fields": {
"name": {
"selectors": [ ".name" ]
},
"price": {
"selectors": [ ".price" ]
},
"average_rating": {
"selectors": [ ".rating" ]
}
}
}
In the above example, an object named “product” would be returned. It would have three children, “name”, “price”, and “average_rating”. The value for name would be extracted from the first element found with the class “name”, where that element is itself a child of the first element found with the class “product-card”.
The above object parsing template would parse this HTML:
<html>
<head>
<title>parsing demo</title>
</head>
<body>
<div class="product-card">
<div class="name">blue jeans</div>
<div class="price">$50</div>
<div class="rating">4/5</div>
</div>
</body>
</html>
into this output:
{
...
"product":{
"name": "blue jeans",
"price": "$50",
"average_rating": "4/5"
}
}

Object lists

An object list combines objects with lists, allowing users to create custom structures that are populated by multiple matching elements. This can be useful, for example, when collecting product data from a page that has multiple products, or to quickly extract SERP listings.
An object-list uses syntax that is very similar to a regular object, except that type is set to object-list instead of object. For example:
{
"links": {
"type": "object-list",
"selectors": ["a"],
"fields": {
"url": {
"selectors": ["*"],
"extractor": "[href]"
},
"title": {
"selectors": ["*"],
"extractor": "text"
}
}
}
}
The above example would parse out all of the links (all of the elements that have an "a" tag) in a webpage. For the following HTML webpage:
<html>
<head>
<title>parsing demo</title>
</head>
<body>
<a href="https://www.somelink.com">Some link</a>
<a href="https://www.anotherlink.com">Another link</a>
</body>
</html>
The output of our object-list template would be:
{
...
"links":[
{
"url": "https://www.somelink.com",
"title": "Some link"
},
{
"url": "https://www.anotherlink.com",
"title": "Another link"
},
...
]
}

Implementing parsing templates

Parsing templates can be implemented in one of two ways:
  • Inline - The parsing template’s rules are defined within the request body. All of the examples shown above have been inline implementations.
  • Upload - Custom parsers can be written separately and uploaded to the WebAPI, and then implemented by passing a parser value instead of being written inline.
We highly recommend uploading your parsing template to increase stability and performance. Parsing templates should only be used inline for testing and development purposes.

Managing parsing templates

Parsing templates can be managed through several API endpoints that allow uploading, viewing, updating, and deleting parsing templates.
post
https://api.webit.live/api/v1/parsers
Upload a new parsing template
get
https://api.webit.live/api/v1/parsers/{parsing-template-name}
View a parsing template
get
https://api.webit.live/api/v1/parsers
List all uploaded parsing templates
delete
https://api.webit.live/api/v1/parsers/{parsing-template-name}
Delete a parsing template
put
https://api.webit.live/api/v1/parsers
Update a parsing template