> ## Documentation Index
> Fetch the complete documentation index at: https://docs.notte.cc/llms.txt
> Use this file to discover all available pages before exploring further.

# Scrape Webpage



## OpenAPI

````yaml https://api.notte.cc/openapi.json post /scrape
openapi: 3.1.0
info:
  title: Notte API
  description: >-
    Notte API is a REST API that allows you to interact with Notte. It is used
    to create cloud browser sessions, scrape webpages, and run web ai agents to
    act on your behalf on the internet.
  version: 1.4.40
  x-logo:
    url: https://www.notte.cc/images/logo/logo-white.png
servers: []
security: []
tags:
  - name: agents
    description: Web AI agents (start, stop, status, replay, etc.)
  - name: sessions
    description: Session management (start, stop, status, etc.)
  - name: debug
    description: Session debugging tools (replay, logs, recording, etc.)
  - name: page
    description: Page operations within a session (observe, step, scrape, etc.)
  - name: storage
    description: File storage interface (upload, download, list, etc.)
  - name: network
    description: Network requests/responses within a session (intercept, etc.)
  - name: vaults
    description: >-
      Vault & Credentials management (create/delete vaults, create/delete
      credentials, etc.)
  - name: personas
    description: Persona management (create, delete, list emails, list sms, etc.)
  - name: scrape
    description: >-
      Webpage scraping (scrape, screenshot, etc.) with automatic session
      management.
  - name: health
    description: Health check endpoint.
  - name: usage
    description: Usage logs (usage, logs, etc.)
  - name: functions
    description: Functions management (create, delete, list, etc.)
paths:
  /scrape:
    post:
      tags:
        - scrape
      summary: Scrape Webpage
      operationId: scrape_webpage
      parameters:
        - name: x-notte-request-origin
          in: header
          required: false
          schema:
            anyOf:
              - type: string
              - type: 'null'
            title: X-Notte-Request-Origin
        - name: x-notte-sdk-version
          in: header
          required: false
          schema:
            anyOf:
              - type: string
              - type: 'null'
            title: X-Notte-Sdk-Version
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/GlobalScrapeRequest'
      responses:
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/DataSpace'
        '422':
          description: Validation Error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HTTPValidationError'
      security:
        - OAuth2PasswordBearer: []
      x-codeSamples:
        - lang: Python
          source: |-
            from notte_sdk import NotteClient
            from pydantic import BaseModel


            class CompanyInfo(BaseModel):
                name: str
                foundation_year: int
                location: str


            notte = NotteClient()
            company_info = notte.scrape(
                url="https://www.ycombinator.com/companies/nottelabs",
                instructions="Extract the company info from the webpage",
                response_format=CompanyInfo,
                only_main_content=False,
            )
          label: python
        - lang: JavaScript
          source: |-
            import { NotteClient } from "notte-sdk";
            import { z } from "zod";

            const CompanyInfo = z.object({
              name: z.string(),
              foundation_year: z.number(),
              location: z.string(),
            });

            const notte = new NotteClient();
            const companyInfo = await notte.scrape(
              "https://www.ycombinator.com/companies/nottelabs",
              {
                instructions: "Extract the company info from the webpage",
                response_format: CompanyInfo,
                only_main_content: false,
              }
            );
          label: node
        # The JSON body below may only use keys declared on GlobalScrapeRequest:
        # that schema sets `additionalProperties: false`, so unknown keys are
        # rejected by validation. Session timeouts are expressed through
        # `max_duration_minutes` and `idle_timeout_minutes`.
        - lang: Curl
          source: |-
            curl -X POST "https://api.notte.cc/scrape" \
            -H "Authorization: Bearer $NOTTE_API_KEY" \
            -H "Content-Type: application/json" \
            -d '{
            "headless": true,
            "solve_captchas": false,
            "max_duration_minutes": 15,
            "idle_timeout_minutes": 3,
            "proxies": false,
            "browser_type": "chromium",
            "user_agent": null,
            "chrome_args": null,
            "viewport_width": null,
            "viewport_height": null,
            "cdp_url": null,
            "use_file_storage": false,
            "screenshot_type": "last_action",
            "scrape_links": true,
            "scrape_images": false,
            "ignored_tags": null,
            "only_main_content": true,
            "only_images": false,
            "response_format": null,
            "instructions": null,
            "use_link_placeholders": false,
            "url": "www.google.com"
            }'
          label: curl
components:
  schemas:
    GlobalScrapeRequest:
      properties:
        headless:
          type: boolean
          title: Headless
          description: Whether to run the session in headless mode.
          default: true
        solve_captchas:
          type: boolean
          title: Solve Captchas
          description: Whether to try to automatically solve captchas
          default: false
        max_duration_minutes:
          type: integer
          maximum: 15
          exclusiveMinimum: 0
          title: Max Duration Minutes
          description: >-
            Maximum session lifetime in minutes (absolute maximum, not affected
            by activity).
          default: 15
        idle_timeout_minutes:
          type: integer
          maximum: 15
          exclusiveMinimum: 0
          title: Idle Timeout Minutes
          description: >-
            Idle timeout in minutes. Session closes after this period of
            inactivity (resets on each operation).
          default: 3
        proxies:
          anyOf:
            - items:
                oneOf:
                  - $ref: '#/components/schemas/NotteProxy'
                  - $ref: '#/components/schemas/ExternalProxy'
                  - $ref: '#/components/schemas/TailnetProxy'
                discriminator:
                  propertyName: type
                  mapping:
                    external:
                      $ref: '#/components/schemas/ExternalProxy'
                    notte:
                      $ref: '#/components/schemas/NotteProxy'
                    tailnet:
                      $ref: '#/components/schemas/TailnetProxy'
              type: array
            - type: boolean
          title: Proxies
          description: >-
            List of custom proxies to use for the session. If True, the default
            proxies will be used.
          default: false
        browser_type:
          type: string
          enum:
            - chromium
            - chrome
            - firefox
            - chrome-nightly
            - chrome-turbo
          title: Browser Type
          # Keep this description in sync with the enum values above.
          description: >-
            The browser type to use. Can be chromium, chrome, firefox,
            chrome-nightly or chrome-turbo.
          default: chromium
        user_agent:
          anyOf:
            - type: string
            - type: 'null'
          title: User Agent
          description: The user agent to use for the session
        chrome_args:
          anyOf:
            - items:
                type: string
              type: array
            - type: 'null'
          title: Chrome Args
          description: Overwrite the chrome instance arguments
        viewport_width:
          anyOf:
            - type: integer
            - type: 'null'
          title: Viewport Width
          description: The width of the viewport
        viewport_height:
          anyOf:
            - type: integer
            - type: 'null'
          title: Viewport Height
          description: The height of the viewport
        aspect_ratio:
          anyOf:
            - type: string
              enum:
                - '5:4'
                - '16:9'
            - type: 'null'
          title: Aspect Ratio
          description: >-
            Viewport shape preset. When set, the backend fits the largest
            rectangle of this aspect ratio inside the sampled available screen
            area. Cannot be combined with explicit
            viewport_width/viewport_height.
        cdp_url:
          anyOf:
            - type: string
            - type: 'null'
          title: Cdp Url
          description: The CDP URL of another remote session provider.
        use_file_storage:
          type: boolean
          title: Use File Storage
          description: Whether FileStorage should be attached to the session.
          default: true
        screenshot_type:
          type: string
          enum:
            - raw
            - full
            - last_action
          title: Screenshot Type
          description: The type of screenshot to use for the session.
          default: last_action
        profile:
          anyOf:
            - $ref: '#/components/schemas/SessionProfile'
            - type: 'null'
          description: Browser profile configuration for state persistence
        web_bot_auth:
          type: boolean
          title: Web Bot Auth
          description: Whether to use web bot authentication.
          default: false
        extra_http_headers:
          anyOf:
            - additionalProperties:
                type: string
              type: object
            - type: 'null'
          title: Extra Http Headers
          description: Extra HTTP headers to be sent with every request.
        vault_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Vault Id
          description: The vault to use for the session
        selector:
          anyOf:
            - type: string
            - type: 'null'
          title: Selector
          description: >-
            Playwright selector to scope the scrape to. Only content inside this
            selector will be scraped.
        scrape_links:
          type: boolean
          title: Scrape Links
          description: Whether to scrape links from the page. Links are scraped by default.
          default: true
        scrape_images:
          type: boolean
          title: Scrape Images
          # The description must agree with `default` below (false).
          description: >-
            Whether to scrape images from the page. Images are not scraped by
            default.
          default: false
        ignored_tags:
          anyOf:
            - items:
                type: string
              type: array
            - type: 'null'
          title: Ignored Tags
          description: HTML tags to ignore from the page
        only_main_content:
          type: boolean
          title: Only Main Content
          description: >-
            Whether to only scrape the main content of the page. If True,
            navbars, footers, etc. are excluded.
          default: false
        only_images:
          type: boolean
          title: Only Images
          description: >-
            Whether to only scrape images from the page. If True, the page
            content is excluded.
          default: false
        response_format:
          anyOf:
            - {}
            - type: 'null'
          title: Response Format
          description: >-
            The response format to use for the scrape. You can use a Pydantic
            model or a JSON Schema dict (cf.
            https://docs.pydantic.dev/latest/concepts/json_schema/#generating-json-schema.)
        instructions:
          anyOf:
            - type: string
            - type: 'null'
          title: Instructions
          description: >-
            Additional instructions to use for the scrape. E.g. 'Extract only
            the title, date and content of the articles.'
        use_link_placeholders:
          type: boolean
          title: Use Link Placeholders
          description: >-
            Whether to use link/image placeholders to reduce the number of
            tokens in the prompt and hallucinations. However this is an
            experimental feature and might not work as expected.
          default: false
        url:
          type: string
          title: Url
      additionalProperties: false
      type: object
      required:
        - url
      title: GlobalScrapeRequest
    DataSpace:
      properties:
        markdown:
          type: string
          title: Markdown
          description: Markdown representation of the extracted data
        images:
          anyOf:
            - items:
                $ref: '#/components/schemas/ImageData'
              type: array
            - type: 'null'
          title: Images
          description: List of images extracted from the page (ID and download link)
        structured:
          anyOf:
            - $ref: '#/components/schemas/StructuredData_BaseModel_'
            - type: 'null'
          description: Structured data extracted from the page in JSON format
      type: object
      required:
        - markdown
      title: DataSpace
    HTTPValidationError:
      properties:
        detail:
          items:
            $ref: '#/components/schemas/ValidationError'
          type: array
          title: Detail
      type: object
      title: HTTPValidationError
    NotteProxy:
      properties:
        type:
          type: string
          const: notte
          title: Type
          default: notte
        id:
          anyOf:
            - type: string
            - type: 'null'
          title: Id
        country:
          anyOf:
            - $ref: '#/components/schemas/ProxyGeolocationCountry'
            - type: 'null'
      additionalProperties: false
      type: object
      title: NotteProxy
    ExternalProxy:
      properties:
        type:
          type: string
          const: external
          title: Type
          default: external
        server:
          type: string
          title: Server
        username:
          anyOf:
            - type: string
            - type: 'null'
          title: Username
        password:
          anyOf:
            - type: string
            - type: 'null'
          title: Password
        bypass:
          anyOf:
            - type: string
            - type: 'null'
          title: Bypass
      additionalProperties: false
      type: object
      required:
        - server
      title: ExternalProxy
    TailnetProxy:
      properties:
        type:
          type: string
          const: tailnet
          title: Type
          default: tailnet
        oauth_client_id:
          type: string
          title: Oauth Client Id
        oauth_client_secret:
          anyOf:
            - type: string
            - type: 'null'
          title: Oauth Client Secret
      additionalProperties: false
      type: object
      required:
        - oauth_client_id
      title: TailnetProxy
    SessionProfile:
      properties:
        id:
          type: string
          title: Id
          description: Profile ID to use for this session
        persist:
          type: boolean
          title: Persist
          description: Whether to save browser state to profile on session close
          default: false
      additionalProperties: false
      type: object
      required:
        - id
      title: SessionProfile
    ImageData:
      properties:
        url:
          anyOf:
            - type: string
            - type: 'null'
          title: Url
          description: URL of the image
        category:
          anyOf:
            - $ref: '#/components/schemas/ImageCategory'
            - type: 'null'
          description: Category of the image (icon, svg, content, etc.)
        description:
          anyOf:
            - type: string
            - type: 'null'
          title: Description
          description: Description of the image
      type: object
      title: ImageData
    StructuredData_BaseModel_:
      properties:
        success:
          type: boolean
          title: Success
          description: Whether the data was extracted successfully
          default: true
        error:
          anyOf:
            - type: string
            - type: 'null'
          title: Error
          description: Error message if the data was not extracted successfully
        data:
          anyOf:
            - $ref: '#/components/schemas/BaseModel'
            - $ref: >-
                #/components/schemas/RootModel_Union_dict_str__Any___list_dict_str__Any____
            - type: 'null'
          title: Data
          description: Structured data extracted from the page in JSON format
      type: object
      title: StructuredData[BaseModel]
    ValidationError:
      properties:
        loc:
          items:
            anyOf:
              - type: string
              - type: integer
          type: array
          title: Location
        msg:
          type: string
          title: Message
        type:
          type: string
          title: Error Type
        input:
          title: Input
        ctx:
          type: object
          title: Context
      type: object
      required:
        - loc
        - msg
        - type
      title: ValidationError
    ProxyGeolocationCountry:
      type: string
      enum:
        - ad
        - ae
        - af
        - ag
        - ai
        - al
        - am
        - ao
        - ar
        - at
        - au
        - aw
        - az
        - ba
        - bb
        - bd
        - be
        - bf
        - bg
        - bh
        - bi
        - bj
        - bm
        - bn
        - bo
        - bq
        - br
        - bs
        - bt
        - bw
        - by
        - bz
        - ca
        - cd
        - cg
        - ch
        - ci
        - cl
        - cm
        - cn
        - co
        - cr
        - cu
        - cv
        - cw
        - cy
        - cz
        - de
        - dj
        - dk
        - dm
        - do
        - dz
        - ec
        - ee
        - eg
        - es
        - et
        - fi
        - fj
        - fr
        - ga
        - gb
        - gd
        - ge
        - gf
        - gg
        - gh
        - gi
        - gm
        - gn
        - gp
        - gq
        - gr
        - gt
        - gu
        - gw
        - gy
        - hk
        - hn
        - hr
        - ht
        - hu
        - id
        - ie
        - il
        - im
        - in
        - iq
        - ir
        - is
        - it
        - je
        - jm
        - jo
        - jp
        - ke
        - kg
        - kh
        - kn
        - kr
        - kw
        - ky
        - kz
        - la
        - lb
        - lc
        - lk
        - lr
        - ls
        - lt
        - lu
        - lv
        - ly
        - ma
        - md
        - me
        - mf
        - mg
        - mk
        - ml
        - mm
        - mn
        - mo
        - mq
        - mr
        - mt
        - mu
        - mv
        - mw
        - mx
        - my
        - mz
        - na
        - nc
        - ne
        - ng
        - ni
        - nl
        - 'no'
        - np
        - nz
        - om
        - pa
        - pe
        - pf
        - pg
        - ph
        - pk
        - pl
        - pr
        - ps
        - pt
        - py
        - qa
        - re
        - ro
        - rs
        - ru
        - rw
        - sa
        - sc
        - sd
        - se
        - sg
        - si
        - sk
        - sl
        - sm
        - sn
        - so
        - sr
        - ss
        - st
        - sv
        - sx
        - sy
        - sz
        - tc
        - tg
        - th
        - tj
        - tm
        - tn
        - tr
        - tt
        - tw
        - tz
        - ua
        - ug
        - us
        - uy
        - uz
        - vc
        - ve
        - vg
        - vi
        - vn
        - ye
        - za
        - zm
        - zw
      title: ProxyGeolocationCountry
    ImageCategory:
      type: string
      enum:
        - favicon
        - icon
        - content_image
        - decorative
        - svg_icon
        - svg_content
      title: ImageCategory
    BaseModel:
      properties: {}
      type: object
      title: BaseModel
    RootModel_Union_dict_str__Any___list_dict_str__Any____:
      anyOf:
        - additionalProperties: true
          type: object
        - items:
            additionalProperties: true
            type: object
          type: array
      title: RootModel[Union[dict[str, Any], list[dict[str, Any]]]]
  securitySchemes:
    OAuth2PasswordBearer:
      type: oauth2
      flows:
        password:
          scopes: {}
          tokenUrl: token

````