> ## Documentation Index
> Fetch the complete documentation index at: https://docs.notte.cc/llms.txt
> Use this file to discover all available pages before exploring further.

# Page Scrape


## OpenAPI

````yaml https://api.notte.cc/openapi.json post /sessions/{session_id}/page/scrape
# OpenAPI document header: spec version and API metadata.
openapi: 3.1.0
info:
  title: Notte API
  # Folded scalar (>-): the lines below are joined with single spaces and the
  # trailing newline is stripped.
  description: >-
    Notte API is a REST API that allows you to interact with Notte.
    It is used to create cloud browser sessions, scrape webpages,
    and run web ai agents to act on your behalf on the internet.
  version: '1.4.40'
  x-logo:
    url: https://www.notte.cc/images/logo/logo-white.png
# No server URLs and no document-wide security requirements are declared.
servers: []
security: []
# Tag catalogue: each entry names a tag and gives a short description of the
# group of operations it covers.
tags:
  - name: agents
    description: Web AI agents (start, stop, status, replay, etc.)
  - name: sessions
    description: Session management (start, stop, status, etc.)
  - name: debug
    description: Session debugging tools (replay, logs, recording, etc.)
  - name: page
    description: Page operations within a session (observe, step, scrape, etc.)
  - name: storage
    description: File storage interface (upload, download, list, etc.)
  - name: network
    description: Network requests/responses within a session (intercept, etc.)
  - name: vaults
    description: >-
      Vault & Credentials management (create/delete vaults, create/delete
      credentials, etc.)
  - name: personas
    description: Persona management (create, delete, list emails, list sms, etc.)
  - name: scrape
    description: >-
      Webpage scraping (scrape, screenshot, etc.) with automatic session
      management.
  - name: health
    description: Health check endpoint.
  - name: usage
    description: Usage logs (usage, logs, etc.)
  - name: functions
    description: Functions management (create, delete, list, etc.)
paths:
  # POST operation `page_scrape`: takes a ScrapeRequest JSON body and returns
  # a DataSpace on 200, or an HTTPValidationError on 422.
  /sessions/{session_id}/page/scrape:
    post:
      operationId: page_scrape
      summary: Page Scrape
      tags:
        - sessions
        - page
      parameters:
        # Required path parameter.
        - name: session_id
          in: path
          required: true
          schema:
            title: Session Id
            type: string
        # Optional query flag; defaults to true.
        - name: update_metadata
          in: query
          required: false
          schema:
            title: Update Metadata
            type: boolean
            default: true
        # Optional headers; each accepts a string or null.
        - name: x-notte-request-origin
          in: header
          required: false
          schema:
            title: X-Notte-Request-Origin
            anyOf:
              - type: string
              - type: 'null'
        - name: x-notte-sdk-version
          in: header
          required: false
          schema:
            title: X-Notte-Sdk-Version
            anyOf:
              - type: string
              - type: 'null'
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ScrapeRequest'
      responses:
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/DataSpace'
        '422':
          description: Validation Error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HTTPValidationError'
      # This operation requires the OAuth2PasswordBearer scheme (no scopes).
      security:
        - OAuth2PasswordBearer: []
      # Code samples attached to this operation, one entry per language.
      x-codeSamples:
        # Python sample: navigates to a page, then scrapes it into a Pydantic
        # model passed as response_format.
        - lang: Python
          label: python
          source: |-
            from notte_sdk import NotteClient, actions
            from pydantic import BaseModel


            class StoryInfo(BaseModel):
                title: str
                points: int | None = None


            class Stories(BaseModel):
                stories: list[StoryInfo]


            notte = NotteClient()
            with notte.Session() as session:
                _ = session.execute(actions.Goto(url="https://news.ycombinator.com"))
                stories = session.scrape(
                    instructions="Extract the top 5 story titles and their points",
                    response_format=Stories,
                )
        # JavaScript sample: same flow as the Python one, with a zod schema
        # passed as response_format.
        - lang: JavaScript
          label: node
          source: |-
            import { NotteClient } from "notte-sdk";
            import { z } from "zod";

            const StoryInfo = z.object({
              title: z.string(),
              points: z.number().nullable(),
            });

            const Stories = z.object({
              stories: z.array(StoryInfo),
            });

            const notte = new NotteClient();
            await notte.Session().use(async (session) => {
              await session.execute({
                type: "goto",
                url: "https://news.ycombinator.com",
              });
              const stories = await session.scrape({
                instructions:
                  "Extract the top 5 story titles and their points",
                response_format: Stories,
              });
            });
        # curl sample: POSTs a JSON ScrapeRequest body with a bearer token.
        - lang: Curl
          # Literal block scalar: every line below is one line of the command,
          # with no trailing newline.
          source: |-
            curl -X POST "https://api.notte.cc/sessions/$session_id/page/scrape" \
            -H "Authorization: Bearer $NOTTE_API_KEY" \
            -H "Content-Type: application/json" \
            -d '{
            "scrape_links": true,
            "scrape_images": false,
            "ignored_tags": null,
            "only_main_content": true,
            "only_images": false,
            "response_format": null,
            "instructions": null,
            "use_link_placeholders": false
            }'
          label: curl
components:
  schemas:
    # Request body for the page scrape operation. Every property is optional
    # (no `required` list) and unknown properties are rejected
    # (additionalProperties: false).
    ScrapeRequest:
      properties:
        selector:
          anyOf:
            - type: string
            - type: 'null'
          title: Selector
          description: >-
            Playwright selector to scope the scrape to. Only content inside this
            selector will be scraped.
        scrape_links:
          type: boolean
          title: Scrape Links
          description: Whether to scrape links from the page. Links are scraped by default.
          default: true
        scrape_images:
          type: boolean
          title: Scrape Images
          # The description is aligned with the declared default (false).
          description: >-
            Whether to scrape images from the page. Images are not scraped by
            default.
          default: false
        ignored_tags:
          anyOf:
            - items:
                type: string
              type: array
            - type: 'null'
          title: Ignored Tags
          description: HTML tags to ignore from the page
        only_main_content:
          type: boolean
          title: Only Main Content
          description: >-
            Whether to only scrape the main content of the page. If True,
            navbars, footers, etc. are excluded.
          # NOTE(review): the curl code sample for this operation sends
          # only_main_content: true, while the default declared here is false.
          # Confirm the intended default.
          default: false
        only_images:
          type: boolean
          title: Only Images
          description: >-
            Whether to only scrape images from the page. If True, the page
            content is excluded.
          default: false
        response_format:
          # The empty schema ({}) accepts any JSON value; null is also allowed.
          anyOf:
            - {}
            - type: 'null'
          title: Response Format
          description: >-
            The response format to use for the scrape. You can use a Pydantic
            model or a JSON Schema dict (cf.
            https://docs.pydantic.dev/latest/concepts/json_schema/#generating-json-schema).
        instructions:
          anyOf:
            - type: string
            - type: 'null'
          title: Instructions
          description: >-
            Additional instructions to use for the scrape. E.g. 'Extract only
            the title, date and content of the articles.'
        use_link_placeholders:
          type: boolean
          title: Use Link Placeholders
          description: >-
            Whether to use link/image placeholders to reduce the number of
            tokens in the prompt and hallucinations. However this is an
            experimental feature and might not work as expected.
          default: false
      additionalProperties: false
      type: object
      title: ScrapeRequest
    # Successful scrape response: `markdown` is required; `images` and
    # `structured` may be null.
    DataSpace:
      title: DataSpace
      type: object
      required:
        - markdown
      properties:
        markdown:
          title: Markdown
          type: string
          description: Markdown representation of the extracted data
        images:
          title: Images
          description: List of images extracted from the page (ID and download link)
          anyOf:
            - type: array
              items:
                $ref: '#/components/schemas/ImageData'
            - type: 'null'
        structured:
          description: Structured data extracted from the page in JSON format
          anyOf:
            - $ref: '#/components/schemas/StructuredData_BaseModel_'
            - type: 'null'
    # Body of the 422 response: a list of ValidationError items under `detail`.
    HTTPValidationError:
      title: HTTPValidationError
      type: object
      properties:
        detail:
          title: Detail
          type: array
          items:
            $ref: '#/components/schemas/ValidationError'
    # One extracted image; every field is optional and nullable.
    ImageData:
      title: ImageData
      type: object
      properties:
        url:
          title: Url
          description: URL of the image
          anyOf:
            - type: string
            - type: 'null'
        category:
          description: Category of the image (icon, svg, content, etc.)
          anyOf:
            - $ref: '#/components/schemas/ImageCategory'
            - type: 'null'
        description:
          title: Description
          description: Description of the image
          anyOf:
            - type: string
            - type: 'null'
    # Wrapper for structured extraction results: a success flag, an optional
    # error message, and the extracted data (or null).
    StructuredData_BaseModel_:
      title: StructuredData[BaseModel]
      type: object
      properties:
        success:
          title: Success
          type: boolean
          description: Whether the data was extracted successfully
          default: true
        error:
          title: Error
          description: Error message if the data was not extracted successfully
          anyOf:
            - type: string
            - type: 'null'
        data:
          title: Data
          description: Structured data extracted from the page in JSON format
          anyOf:
            - $ref: '#/components/schemas/BaseModel'
            - $ref: '#/components/schemas/RootModel_Any_'
            - type: 'null'
    # A single validation failure; `loc`, `msg` and `type` are required.
    ValidationError:
      title: ValidationError
      type: object
      required:
        - loc
        - msg
        - type
      properties:
        # Sequence of string or integer items.
        loc:
          title: Location
          type: array
          items:
            anyOf:
              - type: string
              - type: integer
        msg:
          title: Message
          type: string
        type:
          title: Error Type
          type: string
        # No type is declared for `input`.
        input:
          title: Input
        ctx:
          title: Context
          type: object
    # String enum of the image categories referenced by ImageData.category.
    ImageCategory:
      title: ImageCategory
      type: string
      enum:
        - favicon
        - icon
        - content_image
        - decorative
        - svg_icon
        - svg_content
    # Object schema that declares no properties of its own.
    BaseModel:
      title: BaseModel
      type: object
      properties: {}
    # Declares only a title; no type or property constraints are specified.
    RootModel_Any_:
      title: RootModel[Any]
  # OAuth2 password flow; the token URL is the relative path `token` and no
  # scopes are defined.
  securitySchemes:
    OAuth2PasswordBearer:
      type: oauth2
      flows:
        password:
          tokenUrl: token
          scopes: {}

````